void SpatialBatchNormalization::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  Tensor<float>* out = TO_TENSOR_PTR(output.get());
  RASSERT(in->dim() >= 3);
  RASSERT(in->size()[2] == nfeats_);

  if (output != nullptr && in->dim() != out->dim()) {
    output = nullptr;
  }
  // Check that the input and output size are the same.
  if (output != nullptr) {
    if (in->size()[0] != out->size()[0] || in->size()[1] != out->size()[1] ||
        in->size()[2] != out->size()[2]) {
      output = nullptr;
    }
  }
  if (output == nullptr) {
    output.reset(new Tensor<float>(in->dim(), in->size()));
  }
}
void SpatialDropout::forwardProp(std::shared_ptr<TorchData> input) {
  init(input);
  Tensor<float>::copy(*TO_TENSOR_PTR(output.get()),
                      *TO_TENSOR_PTR(input.get()));
  // Inference-mode dropout: no random masking, just a uniform scale by (1 - p).
  Tensor<float>::mul(*TO_TENSOR_PTR(output.get()), 1 - p_);
}
void SpatialLPPooling::forwardPropThread(const uint32_t outf) {
  const uint32_t out_w = TO_TENSOR_PTR(output.get())->size()[0];
  const uint32_t out_h = TO_TENSOR_PTR(output.get())->size()[1];
  const uint32_t in_w = cur_in_w;
  const uint32_t in_h = cur_in_h;
  const float one_over_p_norm = 1.0f / p_norm_;

  float* out = &output_cpu_[outf * out_w * out_h];
  float* in = &input_cpu_[outf * in_w * in_h];

  for (uint32_t outv = 0; outv < out_h; outv++) {
    for (uint32_t outu = 0; outu < out_w; outu++) {
      const uint32_t out_index = outv * out_w + outu;
      out[out_index] = 0.0f;
      // Now perform LP pooling: accumulate |x|^p over the pool window...
      for (uint32_t inv = outv * poolsize_v_; inv < (outv + 1) * poolsize_v_;
           inv++) {
        for (uint32_t inu = outu * poolsize_u_; inu < (outu + 1) * poolsize_u_;
             inu++) {
          const float val = fabsf(in[inv * in_w + inu]);
          out[out_index] += powf(val, p_norm_);
        }
      }
      // ...then take the p-th root of the accumulated sum.
      out[out_index] = powf(out[out_index], one_over_p_norm);
    }
  }

  std::unique_lock<std::mutex> ul(thread_update_lock_);
  threads_finished_++;
  not_finished_.notify_all();  // Signal that all threads might have finished.
  ul.unlock();
}
void Threshold::forwardProp(std::shared_ptr<TorchData> input) {
  init(input);
  cl_context->useKernelCStr(kThresholdKernel, "Threshold1D");
  cl_context->setArg(0, TO_TENSOR_PTR(input.get())->storage());
  cl_context->setArg(1, TO_TENSOR_PTR(output.get())->storage());
  cl_context->setArg(2, threshold);
  cl_context->setArg(3, val);
  uint32_t dim = 1;
  uint32_t nelem = TO_TENSOR_PTR(output.get())->nelems();
  cl_context->runKernel(jtorch::deviceid, dim, &nelem, false);
}
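// A minimal CPU sketch of what the Threshold1D kernel is assumed to compute,
// following Torch's nn.Threshold semantics. The kernel source is not shown in
// this file, so this reference implementation is an assumption; the function
// name and parameter layout are illustrative only.
void thresholdReference(const float* in, float* out, uint32_t nelems,
                        float threshold, float val) {
  for (uint32_t i = 0; i < nelems; i++) {
    // Pass values above the threshold through; replace the rest with val.
    out[i] = in[i] > threshold ? in[i] : val;
  }
}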
void Threshold::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  if (output != nullptr) {
    if (!in->isSameSizeAs(*TO_TENSOR_PTR(output.get()))) {
      // Input dimension has changed!
      output = nullptr;
    }
  }
  if (output == nullptr) {
    output.reset(new Tensor<float>(in->dim(), in->size()));
  }
}
void SpatialDropout::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  if (output != nullptr) {
    if (!TO_TENSOR_PTR(output.get())->isSameSizeAs(*in)) {
      output = nullptr;
    }
  }
  if (output == nullptr) {
    output.reset(Tensor<float>::clone(*in));
  }
}
void SpatialLPPooling::forwardProp(std::shared_ptr<TorchData> input) {
  init(input);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  in->getData(input_cpu_.get());
  cur_in_w = in->size()[0];
  cur_in_h = in->size()[1];
  threads_finished_ = 0;
  for (uint32_t i = 0; i < thread_cbs_.size(); i++) {
    tp_->addTask(thread_cbs_[i].get());
  }
  // Wait for all threads to finish.
  std::unique_lock<std::mutex> ul(thread_update_lock_);  // Get lock
  while (threads_finished_ != static_cast<int32_t>(thread_cbs_.size())) {
    not_finished_.wait(ul);
  }
  ul.unlock();  // Release lock
  TO_TENSOR_PTR(output.get())->setData(output_cpu_.get());
}
void SpatialMaxPooling::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  RASSERT(in->dim() == 2 || in->dim() == 3);

  // We'll essentially do ceil_mode = false from torch.
  const uint32_t iwidth = in->size()[0];
  const uint32_t iheight = in->size()[1];
  uint32_t oheight = (long)(floor((float)(iheight - kh_ + 2 * padh_) / dh_)) + 1;
  uint32_t owidth = (long)(floor((float)(iwidth - kw_ + 2 * padw_) / dw_)) + 1;

  if (output != nullptr && TO_TENSOR_PTR(output.get())->dim() != in->dim()) {
    // Input dimension has changed!
    output = nullptr;
  }
  if (output != nullptr) {
    // Check that the dimensions above the lowest 2 match.
    for (uint32_t i = 2; i < in->dim() && output != nullptr; i++) {
      if (TO_TENSOR_PTR(output.get())->size()[i] != in->size()[i]) {
        output = nullptr;
      }
    }
  }
  if (output != nullptr) {
    // Check that the lowest 2 dimensions are the correct size.
    if (TO_TENSOR_PTR(output.get())->size()[0] != owidth ||
        TO_TENSOR_PTR(output.get())->size()[1] != oheight) {
      output = nullptr;
    }
  }
  if (output == nullptr) {
    std::unique_ptr<uint32_t[]> out_size(new uint32_t[in->dim()]);
    out_size[0] = owidth;
    out_size[1] = oheight;
    for (uint32_t i = 2; i < in->dim(); i++) {
      out_size[i] = in->size()[i];
    }
    output.reset(new Tensor<float>(in->dim(), out_size.get()));
  }
}
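// A worked instance of the pooling output-size rule used above (Torch's
// ceil_mode = false): o = floor((i - k + 2*p) / d) + 1. For example, with
// i = 32, k = 3, d = 2, p = 1: floor(31 / 2) + 1 = 16. The helper below is
// illustrative only; it is not part of jtorch.
static uint32_t pooledSize(uint32_t i, uint32_t k, uint32_t d, uint32_t p) {
  return (i - k + 2 * p) / d + 1;  // Integer division == floor for positives.
}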
void SpatialLPPooling::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  RASSERT(in->dim() == 2 || in->dim() == 3);

  if (output != nullptr && TO_TENSOR_PTR(output.get())->dim() != in->dim()) {
    // Input dimension has changed!
    cleanup();
  }
  if (output != nullptr) {
    // Check that the dimensions above the lowest 2 match.
    for (uint32_t i = 2; i < in->dim() && output != nullptr; i++) {
      if (TO_TENSOR_PTR(output.get())->size()[i] != in->size()[i]) {
        cleanup();
      }
    }
  }
  if (output != nullptr) {
    // Check that the lowest 2 dimensions are the correct size.
    if (TO_TENSOR_PTR(output.get())->size()[0] != in->size()[0] / poolsize_u_ ||
        TO_TENSOR_PTR(output.get())->size()[1] != in->size()[1] / poolsize_v_) {
      cleanup();
    }
  }
  if (output == nullptr) {
    // Check that the width and height are multiples of the poolsize.
    RASSERT(in->size()[0] % poolsize_u_ == 0 &&
            in->size()[1] % poolsize_v_ == 0);
    std::unique_ptr<uint32_t[]> out_size(new uint32_t[in->dim()]);
    out_size[0] = in->size()[0] / poolsize_u_;
    out_size[1] = in->size()[1] / poolsize_v_;
    for (uint32_t i = 2; i < in->dim(); i++) {
      out_size[i] = in->size()[i];
    }
    output.reset(new Tensor<float>(in->dim(), out_size.get()));
    input_cpu_.reset(new float[in->nelems()]);
    output_cpu_.reset(new float[TO_TENSOR_PTR(output.get())->nelems()]);
  }

  uint32_t n_threads = 1;
  if (in->dim() > 2) {
    n_threads = TO_TENSOR_PTR(output.get())->size()[2];
  }
  if (thread_cbs_.size() != n_threads) {
    // Note: the original called std::vector::empty(), which only tests for
    // emptiness; clear() is needed to actually discard the stale callbacks.
    thread_cbs_.clear();
    for (uint32_t f = 0; f < n_threads; f++) {
      thread_cbs_.push_back(std::unique_ptr<jcl::threading::Callback<void>>(
          MakeCallableMany(&SpatialLPPooling::forwardPropThread, this, f)));
    }
  }
}
void JoinTable::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TABLE_DATA);  // Table expected
  Table* in = TO_TABLE_PTR(input.get());
  RASSERT(in->tableSize() > 0);

  // Check that it is a table of FloatTensors.
  for (uint32_t i = 0; i < in->tableSize(); i++) {
    // Table of float tensors expected.
    RASSERT((*in)(i)->type() == TENSOR_DATA);
  }

  uint32_t dim = TO_TENSOR_PTR((*in)(0).get())->dim();
  RASSERT(dim > dimension_);  // Otherwise input is smaller than join dimension
  uint32_t jdim = dim - dimension_ - 1;  // dimension_=0 is the top dim

  // Make sure the dimensions OTHER than the join dimension are all the same.
  for (uint32_t d = 0; d < dim; d++) {
    if (d != jdim) {
      for (uint32_t j = 1; j < in->tableSize(); j++) {
        // Sizes must match.
        RASSERT(TO_TENSOR_PTR((*in)(j).get())->size()[d] ==
                TO_TENSOR_PTR((*in)(0).get())->size()[d]);
      }
      if (output != nullptr &&
          TO_TENSOR_PTR(output.get())->size()[d] !=
              TO_TENSOR_PTR((*in)(0).get())->size()[d]) {
        output = nullptr;
      }
    }
  }

  // The joined size is the sum over ALL table entries (including the 0th).
  uint32_t nelems_jdim = 0;
  for (uint32_t j = 0; j < in->tableSize(); j++) {
    nelems_jdim += TO_TENSOR_PTR((*in)(j).get())->size()[jdim];
  }
  if (output != nullptr &&
      TO_TENSOR_PTR(output.get())->size()[jdim] != nelems_jdim) {
    output = nullptr;
  }

  if (output == nullptr) {
    std::unique_ptr<uint32_t[]> size(new uint32_t[dim]);
    memcpy(size.get(), TO_TENSOR_PTR((*in)(0).get())->size(),
           sizeof(size[0]) * dim);
    size[jdim] = nelems_jdim;  // Index with jdim to match the check above.
    output = std::shared_ptr<TorchData>(new Tensor<float>(dim, size.get()));
  }
}
void Reshape::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  int32_t nelems = outNElem();
  static_cast<void>(nelems);
  // Check the input size.
  RASSERT(in->nelems() == static_cast<uint32_t>(nelems));

  if (output != nullptr) {
    Tensor<float>* out = TO_TENSOR_PTR(output.get());
    if (out->storage() != in->storage()) {
      // The tensors don't share the same storage! Reinitialize the view.
      output = nullptr;
    }
  }
  if (output == nullptr) {
    output = Tensor<float>::view(*in, odim_, osize_.get());
  }
}
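// A minimal sketch of why Reshape can skip copying data: Tensor<float>::view
// aliases the input's storage, so values written to the input are visible
// through the reshaped output. The shapes here (8 -> 4x2) and the exact view
// return type are illustrative assumptions based on the calls seen above.
void reshapeViewSketch() {
  uint32_t in_size[1] = {8};
  Tensor<float> in(1, in_size);
  float vals[8] = {0, 1, 2, 3, 4, 5, 6, 7};
  in.setData(vals);

  uint32_t out_size[2] = {4, 2};  // Innermost dimension first, as elsewhere.
  std::shared_ptr<TorchData> out = Tensor<float>::view(in, 2, out_size);

  float read_back[8];
  TO_TENSOR_PTR(out.get())->getData(read_back);  // Same 8 values, no copy.
}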
void JoinTable::forwardProp(std::shared_ptr<TorchData> input) {
  init(input);
  Table* in = (Table*)input.get();

  // AT THE MOMENT ONLY JOINS ALONG THE TOP DIMENSION ARE SUPPORTED
  RASSERT(dimension_ == 0);  // Only dimension=0 is supported for now

  // Copy each table element's raw data into the output.
  cl_context->useKernelCStr(kJoinTable1DKernel, "JoinTable1D");
  int out_offset = 0;
  for (uint32_t i = 0; i < in->tableSize(); i++) {
    Tensor<float>* cur_input = TO_TENSOR_PTR((*in)(i).get());
    cl_context->setArg(0, cur_input->storage());
    cl_context->setArg(1, TO_TENSOR_PTR(output.get())->storage());
    cl_context->setArg(2, out_offset);
    uint32_t dim = 1;
    uint32_t nelem = cur_input->nelems();
    cl_context->runKernel(jtorch::deviceid, dim, &nelem, false);
    out_offset += nelem;
  }
}
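// A plain-CPU sketch of the concatenation the JoinTable1D kernel performs:
// each input's elements land back-to-back at a running offset in the flat
// output buffer. std::vector is used here purely for illustration; it is not
// part of the jtorch API.
#include <vector>
std::vector<float> joinReference(
    const std::vector<std::vector<float>>& inputs) {
  std::vector<float> out;
  for (const auto& in : inputs) {
    // out_offset advances by in.size() on each iteration, as above.
    out.insert(out.end(), in.begin(), in.end());
  }
  return out;
}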
void SpatialConvolution::forwardProp(std::shared_ptr<TorchData> input) {
  init(input);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  if (padding_ > 0) {
    cl_context->useKernelCStr(kSpatialConvolutionKernel,
                              "SpatialConvolutionPadding");
  } else {
    cl_context->useKernelCStr(kSpatialConvolutionKernel, "SpatialConvolution");
  }
  cl_context->setArg(0, in->storage());
  cl_context->setArg(1, TO_TENSOR_PTR(output.get())->storage());
  cl_context->setArg(2, weights_->storage());
  cl_context->setArg(3, biases_->storage());
  cl_context->setArg(4, (int)in->size()[2]);
  cl_context->setArg(5, (int)in->size()[1]);
  cl_context->setArg(6, (int)in->size()[0]);
  cl_context->setArg(7, (int)filt_height_);
  cl_context->setArg(8, (int)filt_width_);
  if (padding_ > 0) {
    cl_context->setArg(9, (int)padding_);
  }
  uint32_t dim = 3;
  cl_context->runKernel(jtorch::deviceid, dim,
                        TO_TENSOR_PTR(output.get())->size(), false);
}
void SpatialConvolution::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  RASSERT(in->dim() == 3);
  RASSERT(in->size()[2] == feats_in_);

  if (output != nullptr) {
    uint32_t owidth = in->size()[0] - filt_width_ + 1 + 2 * padding_;
    uint32_t oheight = in->size()[1] - filt_height_ + 1 + 2 * padding_;
    const uint32_t* out_size = TO_TENSOR_PTR(output.get())->size();
    if (out_size[0] != owidth || out_size[1] != oheight ||
        out_size[2] != feats_out_) {
      output = nullptr;
    }
  }
  if (output == nullptr) {
    uint32_t out_dim[3];
    out_dim[0] = in->size()[0] - filt_width_ + 1 + 2 * padding_;
    out_dim[1] = in->size()[1] - filt_height_ + 1 + 2 * padding_;
    out_dim[2] = feats_out_;
    output.reset(new Tensor<float>(3, out_dim));
  }
}
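// A worked instance of the stride-1 "valid + padding" output-size rule used
// above: o = i - k + 1 + 2*p. For example, a 64-wide input with a 5-wide
// filter and padding 2 gives 64 - 5 + 1 + 4 = 64, i.e. "same" output width.
// Illustrative helper only; not part of jtorch.
static uint32_t convOutSize(uint32_t i, uint32_t k, uint32_t p) {
  return i - k + 1 + 2 * p;
}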
void Select::forwardProp(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);
  // For now we only support the Select operation along the outer dimension.
  // In torch indexing this is always 1.
  RASSERT(this->dimension_ == 1);
  if (src_tensor_ != input.get()) {
    // Only create the tensor slice if the input has changed.
    src_tensor_ = TO_TENSOR_PTR(input.get());
    // Note the index is torch 1-indexed.
    output = Tensor<float>::selectOuterDim(*src_tensor_, this->index_ - 1);
  }
}
void SpatialBatchNormalization::forwardProp(std::shared_ptr<TorchData> input) {
  init(input);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  Tensor<float>* out = TO_TENSOR_PTR(output.get());
  if (affine_) {
    cl_context->useKernelCStr(kSpatialBatchNormalizationKernel,
                              "SpatialBatchNormalizationAffine");
  } else {
    cl_context->useKernelCStr(kSpatialBatchNormalizationKernel,
                              "SpatialBatchNormalization");
  }
  cl_context->setArg(0, in->storage());
  cl_context->setArg(1, TO_TENSOR_PTR(running_mean_.get())->storage());
  cl_context->setArg(2, TO_TENSOR_PTR(running_std_.get())->storage());
  cl_context->setArg(3, out->storage());
  if (affine_) {
    cl_context->setArg(4, TO_TENSOR_PTR(weights_.get())->storage());
    cl_context->setArg(5, TO_TENSOR_PTR(biases_.get())->storage());
  }
  cl_context->runKernel(jtorch::deviceid, out->dim(), out->size(), false);
}
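// A CPU sketch of inference-mode spatial batch normalization: one mean, scale,
// and (optionally) affine weight/bias per feature plane f. This assumes
// running_std_ stores the running average of 1/sqrt(var + eps), as older
// Torch nn.BatchNormalization did, so the kernel can multiply rather than
// divide; the kernel source is not shown here, so treat this as a sketch.
// plane_size (= width * height) and all parameter names are illustrative.
void batchNormReference(const float* in, float* out, const float* mean,
                        const float* inv_std, const float* w, const float* b,
                        uint32_t nfeats, uint32_t plane_size, bool affine) {
  for (uint32_t f = 0; f < nfeats; f++) {
    for (uint32_t i = 0; i < plane_size; i++) {
      float v = (in[f * plane_size + i] - mean[f]) * inv_std[f];
      out[f * plane_size + i] = affine ? v * w[f] + b[f] : v;
    }
  }
}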
void SpatialMaxPooling::forwardProp(std::shared_ptr<TorchData> input) {
  init(input);
  bool two_dim = TO_TENSOR_PTR(input.get())->dim() == 2;
  if (two_dim) {
    cl_context->useKernelCStr(kSpatialMaxPoolingKernel, "SpatialMaxPooling2D");
  } else {
    cl_context->useKernelCStr(kSpatialMaxPoolingKernel, "SpatialMaxPooling");
  }
  cl_context->setArg(0, TO_TENSOR_PTR(input.get())->storage());
  cl_context->setArg(1, TO_TENSOR_PTR(output.get())->storage());
  cl_context->setArg(2, (int)TO_TENSOR_PTR(input.get())->size()[1]);
  cl_context->setArg(3, (int)TO_TENSOR_PTR(input.get())->size()[0]);
  cl_context->setArg(4, (int)kw_);
  cl_context->setArg(5, (int)kh_);
  cl_context->setArg(6, (int)dw_);
  cl_context->setArg(7, (int)dh_);
  cl_context->setArg(8, (int)padw_);
  cl_context->setArg(9, (int)padh_);
  cl_context->runKernel(jtorch::deviceid, TO_TENSOR_PTR(output.get())->dim(),
                        TO_TENSOR_PTR(output.get())->size(), false);
}
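// A CPU sketch of the padded max-pooling window the kernel is assumed to
// evaluate: the window is clamped to the image bounds so padded
// (out-of-range) pixels never contribute, matching Torch's SpatialMaxPooling.
// Parameter names mirror the setArg calls above; this is illustrative only.
#include <algorithm>
#include <limits>
void maxPoolReference(const float* in, float* out, int in_w, int in_h,
                      int out_w, int out_h, int kw, int kh, int dw, int dh,
                      int padw, int padh) {
  for (int ov = 0; ov < out_h; ov++) {
    for (int ou = 0; ou < out_w; ou++) {
      float best = -std::numeric_limits<float>::max();
      for (int v = ov * dh - padh; v < ov * dh - padh + kh; v++) {
        for (int u = ou * dw - padw; u < ou * dw - padw + kw; u++) {
          if (u >= 0 && u < in_w && v >= 0 && v < in_h) {
            best = std::max(best, in[v * in_w + u]);
          }
        }
      }
      out[ov * out_w + ou] = best;
    }
  }
}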
void SpatialDivisiveNormalization::forwardProp(
    std::shared_ptr<TorchData> input) {
  init(input);
  bool onedim_kernel = kernel_->dim() == 1;
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  Tensor<float>* out = TO_TENSOR_PTR(output.get());
  if (onedim_kernel) {
    int32_t filt_rad = ((int32_t)kernel_norm_->size()[0] - 1) / 2;

    // Perform horizontal filter pass.
    cl_context->useKernelCStr(kSpatialDivisiveNormalizationKernel,
                              "SpatialDivisiveNormalizationHoriz");
    cl_context->setArg(0, in->storage());
    cl_context->setArg(1, std_pass1_->storage());
    cl_context->setArg(2, kernel_norm_->storage());
    cl_context->setArg(3, filt_rad);
    cl_context->runKernel(jtorch::deviceid, std_pass1_->dim(),
                          std_pass1_->size(), false);

    // Perform vertical filter pass.
    cl_context->useKernelCStr(kSpatialDivisiveNormalizationKernel,
                              "SpatialDivisiveNormalizationVert");
    cl_context->setArg(0, std_pass1_->storage());
    cl_context->setArg(1, std_pass2_->storage());
    cl_context->setArg(2, kernel_norm_->storage());
    cl_context->setArg(3, filt_rad);
    cl_context->runKernel(jtorch::deviceid, std_pass2_->dim(),
                          std_pass2_->size(), false);
  } else {
    int32_t filt_rad_u = ((int32_t)kernel_norm_->size()[0] - 1) / 2;
    int32_t filt_rad_v = ((int32_t)kernel_norm_->size()[1] - 1) / 2;

    // Perform a single (non-separable) 2D filter pass.
    cl_context->useKernelCStr(kSpatialDivisiveNormalizationKernel,
                              "SpatialDivisiveNormalization2D");
    cl_context->setArg(0, in->storage());
    cl_context->setArg(1, std_pass2_->storage());
    cl_context->setArg(2, kernel_norm_->storage());
    cl_context->setArg(3, filt_rad_u);
    cl_context->setArg(4, filt_rad_v);
    cl_context->runKernel(jtorch::deviceid, std_pass2_->dim(),
                          std_pass2_->size(), false);
  }

  // Perform accumulation and division pass.
  cl_context->useKernelCStr(kSpatialDivisiveNormalizationKernel,
                            "SpatialDivisiveNormalizationAccumDiv");
  cl_context->setArg(0, std_pass2_->storage());
  cl_context->setArg(1, std_->storage());
  cl_context->setArg(2, std_coef_->storage());
  cl_context->setArg(3, (int)out->size()[2]);
  cl_context->setArg(4, threshold_);
  cl_context->runKernel(jtorch::deviceid, std_->dim(), std_->size(), false);

  // Perform normalization pass.
  cl_context->useKernelCStr(kSpatialDivisiveNormalizationKernel,
                            "SpatialDivisiveNormalization");
  cl_context->setArg(0, in->storage());
  cl_context->setArg(1, out->storage());
  cl_context->setArg(2, std_->storage());
  cl_context->runKernel(jtorch::deviceid, out->dim(), out->size(), false);
}
void SpatialSubtractiveNormalization::init(TorchData& input) {
  if (input.type() != TorchDataType::TENSOR_DATA) {
    throw std::runtime_error("SpatialSubtractiveNormalization::init() - "
                             "FloatTensor expected!");
  }
  Tensor<float>& in = (Tensor<float>&)input;
  if (in.dim() != 3) {
    throw std::runtime_error("SpatialSubtractiveNormalization::init() - "
                             "3D input is expected!");
  }

  if (output != NULL) {
    if (!in.isSameSizeAs(*(Tensor<float>*)output)) {
      // Input dimension has changed!
      cleanup();
    }
  }
  if (output == NULL) {
    output = new Tensor<float>(in.dim(), in.size());
    mean_pass1_ = new Tensor<float>(in.dim(), in.size());
    mean_pass2_ = new Tensor<float>(in.dim(), in.size());
  }

  if (mean_coef_ == NULL) {
    uint32_t mean_coeff_size[2];
    mean_coeff_size[0] = TO_TENSOR_PTR(output)->size()[0];
    mean_coeff_size[1] = TO_TENSOR_PTR(output)->size()[1];
    mean_coef_ = new Tensor<float>(2, mean_coeff_size);

    float* mean_coef_cpu = new float[mean_coef_->nelems()];
    float* kernel_cpu = new float[kernel_->nelems()];
    kernel_->getData(kernel_cpu);
    bool onedim_kernel = kernel_->dim() == 1;

    // Filter an image of all 1 values to create the normalization constants.
    // See norm_test.lua for proof that this works as well as:
    // https://github.com/andresy/torch/blob/master/extra/nn/SpatialSubtractiveNormalization.lua
    int32_t n_feats = TO_TENSOR_PTR(output)->size()[2];
    int32_t height = TO_TENSOR_PTR(output)->size()[1];
    int32_t width = TO_TENSOR_PTR(output)->size()[0];
    if (onedim_kernel) {
      // 1D case - The filter is separable, but we'll just do the dumb 2D
      // version since we only do this once on startup. --> O(n * m)
      uint32_t kernel_size = kernel_->size()[0];
      int32_t filt_rad = (kernel_size - 1) / 2;
      for (int32_t v = 0; v < height; v++) {
        for (int32_t u = 0; u < width; u++) {
          float tmp = 0.0f;
          for (int32_t v_filt = -filt_rad; v_filt <= filt_rad; v_filt++) {
            for (int32_t u_filt = -filt_rad; u_filt <= filt_rad; u_filt++) {
              int32_t u_in = u + u_filt;
              int32_t v_in = v + v_filt;
              if (u_in >= 0 && u_in < width && v_in >= 0 && v_in < height) {
                // Pixel is inside --> We'll effectively clamp zeros elsewhere.
                tmp += (kernel_cpu[v_filt + filt_rad] *
                        kernel_cpu[u_filt + filt_rad]);
              }
            }
          }
          mean_coef_cpu[v * width + u] = tmp / n_feats;
        }
      }
    } else {
      // 2D case
      int32_t kernel_size_u = kernel_->size()[0];
      int32_t kernel_size_v = kernel_->size()[1];
      int32_t filt_rad_u = (kernel_size_u - 1) / 2;
      int32_t filt_rad_v = (kernel_size_v - 1) / 2;
      for (int32_t v = 0; v < height; v++) {
        for (int32_t u = 0; u < width; u++) {
          float tmp = 0.0f;
          for (int32_t v_filt = -filt_rad_v; v_filt <= filt_rad_v; v_filt++) {
            for (int32_t u_filt = -filt_rad_u; u_filt <= filt_rad_u; u_filt++) {
              int32_t u_in = u + u_filt;
              int32_t v_in = v + v_filt;
              if (u_in >= 0 && u_in < width && v_in >= 0 && v_in < height) {
                // Pixel is inside --> We'll effectively clamp zeros elsewhere.
                tmp += kernel_cpu[(v_filt + filt_rad_v) * kernel_size_u +
                                  (u_filt + filt_rad_u)];
              }
            }
          }
          mean_coef_cpu[v * width + u] = tmp / n_feats;
        }
      }
    }
    mean_coef_->setData(mean_coef_cpu);
    delete[] mean_coef_cpu;
    delete[] kernel_cpu;
  }

  if (mean_ == NULL) {
    uint32_t mean_coeff_size[2];
    mean_coeff_size[0] = TO_TENSOR_PTR(output)->size()[0];
    mean_coeff_size[1] = TO_TENSOR_PTR(output)->size()[1];
    mean_ = new Tensor<float>(2, mean_coeff_size);
  }
}
void SpatialDivisiveNormalization::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);
  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  RASSERT(in->dim() == 3);

  if (output != nullptr) {
    if (!in->isSameSizeAs(*TO_TENSOR_PTR(output.get()))) {
      // Input dimension has changed!
      cleanup();
    }
  }
  if (output == nullptr) {
    output.reset(new Tensor<float>(in->dim(), in->size()));
    std_pass1_.reset(new Tensor<float>(in->dim(), in->size()));
    std_pass2_.reset(new Tensor<float>(in->dim(), in->size()));
  }
  if (kernel_norm_ == nullptr) {
    bool onedim_kernel = kernel_->dim() == 1;
    const float n_feats = (float)in->size()[2];

    // Clone and normalize the input kernel.
    kernel_norm_.reset(Tensor<float>::clone(*kernel_));
    float sum = Tensor<float>::slowSum(*kernel_norm_);
    float div_val = onedim_kernel ? (sum * sqrtf(n_feats)) : (sum * n_feats);
    Tensor<float>::div(*kernel_norm_, div_val);
  }
  if (std_coef_ == nullptr) {
    uint32_t std_coeff_size[2];
    std_coeff_size[0] = TO_TENSOR_PTR(output.get())->size()[0];
    std_coeff_size[1] = TO_TENSOR_PTR(output.get())->size()[1];
    std_coef_.reset(new Tensor<float>(2, std_coeff_size));

    std::unique_ptr<float[]> std_coef_cpu(new float[std_coef_->nelems()]);
    std::unique_ptr<float[]> kernel_norm_cpu(
        new float[kernel_norm_->nelems()]);
    kernel_norm_->getData(kernel_norm_cpu.get());
    bool onedim_kernel = kernel_->dim() == 1;

    // Filter an image of all 1 values to create the normalization constants.
    // See norm_test.lua for proof that this works as well as:
    // https://github.com/andresy/torch/blob/master/extra/nn/SpatialDivisiveNormalization.lua
    int32_t n_feats = TO_TENSOR_PTR(output.get())->size()[2];
    int32_t height = TO_TENSOR_PTR(output.get())->size()[1];
    int32_t width = TO_TENSOR_PTR(output.get())->size()[0];
    if (onedim_kernel) {
      // 1D case - The filter is separable, but we'll just do the dumb 2D
      // version since we only do this once on startup. --> O(n * m)
      int32_t kernel_size = kernel_norm_->size()[0];
      int32_t filt_rad = (kernel_size - 1) / 2;
      for (int32_t v = 0; v < height; v++) {
        for (int32_t u = 0; u < width; u++) {
          float tmp = 0.0f;
          for (int32_t v_filt = -filt_rad; v_filt <= filt_rad; v_filt++) {
            for (int32_t u_filt = -filt_rad; u_filt <= filt_rad; u_filt++) {
              int32_t u_in = u + u_filt;
              int32_t v_in = v + v_filt;
              if (u_in >= 0 && u_in < width && v_in >= 0 && v_in < height) {
                // Pixel is inside --> We'll effectively clamp zeros elsewhere.
                tmp += (kernel_norm_cpu[v_filt + filt_rad] *
                        kernel_norm_cpu[u_filt + filt_rad]);
              }
            }
          }
          std_coef_cpu[v * width + u] = tmp / n_feats;
        }
      }
    } else {
      // 2D case
      int32_t kernel_size_u = kernel_norm_->size()[0];
      int32_t kernel_size_v = kernel_norm_->size()[1];
      int32_t filt_rad_u = (kernel_size_u - 1) / 2;
      int32_t filt_rad_v = (kernel_size_v - 1) / 2;
      for (int32_t v = 0; v < height; v++) {
        for (int32_t u = 0; u < width; u++) {
          float tmp = 0.0f;
          for (int32_t v_filt = -filt_rad_v; v_filt <= filt_rad_v; v_filt++) {
            for (int32_t u_filt = -filt_rad_u; u_filt <= filt_rad_u; u_filt++) {
              int32_t u_in = u + u_filt;
              int32_t v_in = v + v_filt;
              if (u_in >= 0 && u_in < width && v_in >= 0 && v_in < height) {
                // Pixel is inside --> We'll effectively clamp zeros elsewhere.
                tmp += kernel_norm_cpu[(v_filt + filt_rad_v) * kernel_size_u +
                                       (u_filt + filt_rad_u)];
              }
            }
          }
          std_coef_cpu[v * width + u] = tmp / n_feats;
        }
      }
    }
    std_coef_->setData(std_coef_cpu.get());
  }
  if (std_ == nullptr) {
    uint32_t std_coeff_size[2];
    std_coeff_size[0] = TO_TENSOR_PTR(output.get())->size()[0];
    std_coeff_size[1] = TO_TENSOR_PTR(output.get())->size()[1];
    std_.reset(new Tensor<float>(2, std_coeff_size));
  }
}
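// Why the 1D kernel above is divided by sum * sqrt(n_feats): a separable
// kernel is applied twice (horizontally, then vertically), so the effective
// 2D weight at (u, v) is k[u] * k[v]. Dividing each 1D tap by
// sum * sqrt(n_feats) therefore divides the 2D outer product by
// sum^2 * n_feats, matching the non-separable sum * n_feats normalization.
// A small numeric check of that identity (kernel values are arbitrary):
#include <cassert>
#include <cmath>
void separableNormCheck() {
  const float k[3] = {1.0f, 2.0f, 1.0f};  // sum = 4
  const float n_feats = 3.0f;
  const float div1d = (1.0f + 2.0f + 1.0f) * sqrtf(n_feats);
  // Effective 2D weight at the kernel center after both 1D passes:
  float w2d = (k[1] / div1d) * (k[1] / div1d);
  // Direct 2D normalization of the outer-product kernel at its center:
  float w2d_direct = (k[1] * k[1]) / ((4.0f * 4.0f) * n_feats);
  assert(fabsf(w2d - w2d_direct) < 1e-6f);  // Both equal 4 / (16 * 3).
}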