Example #1
void greentea_col2im_sk_gpu(viennacl::ocl::program *prog,
                            viennacl::ocl::context *ctx, const cl_mem data_col,
                            const int_tp channels, const int_tp height,
                            const int_tp width, const int_tp patch_h,
                            const int_tp patch_w, const int_tp pad_h,
                            const int_tp pad_w, const int_tp stride_h,
                            const int_tp stride_w, const int_tp kstride_h,
                            const int_tp kstride_w, cl_mem data_im,
                            const int_tp data_offset) {
  if (stride_w > 1 || stride_h > 1 || pad_h > 0 || pad_w > 0) {
    LOG(FATAL) << "stride greater than 1 or pad greater than 0"
               << " not tested in col2im_sk_gpu().";
  }

  int_tp ext_patch_h = (patch_h - 1) * kstride_h + 1;
  int_tp ext_patch_w = (patch_w - 1) * kstride_w + 1;
  int_tp height_col = (height + 2 * pad_h - ext_patch_h) / stride_h + 1;
  int_tp width_col = (width + 2 * pad_w - ext_patch_w) / stride_w + 1;
  int_tp num_kernels = channels * height * width;

  viennacl::ocl::kernel &kernel = prog->get_kernel(
      CL_KERNEL_SELECT("col2im_sk"));

  viennacl::ocl::enqueue(
      kernel(num_kernels, WrapHandle(data_col, ctx), height, width, channels,
          patch_h, patch_w, ext_patch_h, ext_patch_w,
          pad_h, pad_w, stride_h, stride_w, kstride_h, kstride_w,
          height_col, width_col, WrapHandle(data_im, ctx), data_offset),
      ctx->get_queue());
}
Example #2
void greentea_im2col_sk_gpu(viennacl::ocl::program *prog,
                            viennacl::ocl::context *ctx, const cl_mem data_im,
                            const int_tp data_offset, const int_tp channels,
                            const int_tp height, const int_tp width,
                            const int_tp kernel_h, const int_tp kernel_w,
                            const int_tp pad_h, const int_tp pad_w,
                            const int_tp stride_h, const int_tp stride_w,
                            const int_tp kstride_h, const int_tp kstride_w,
                            cl_mem data_col) {
  int_tp ext_kernel_h = (kernel_h - 1) * kstride_h + 1;
  int_tp ext_kernel_w = (kernel_w - 1) * kstride_w + 1;
  int_tp height_col = (height + 2 * pad_h - ext_kernel_h) / stride_h + 1;
  int_tp width_col = (width + 2 * pad_w - ext_kernel_w) / stride_w + 1;
  int_tp num_kernels = channels * height_col * width_col;

  viennacl::ocl::kernel &kernel = prog->get_kernel(
      CL_KERNEL_SELECT("im2col_sk"));

  viennacl::ocl::enqueue(
      kernel(num_kernels, WrapHandle(data_im, ctx), data_offset, height, width,
             kernel_h, kernel_w, ext_kernel_h, ext_kernel_w, pad_h, pad_w,
             stride_h, stride_w, kstride_h, kstride_w, height_col, width_col,
             WrapHandle(data_col, ctx)),
      ctx->get_queue());
}
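
Both of the _sk ("strided kernel") helpers above derive the extended receptive field and the output grid from the kernel stride before launching. A minimal standalone sketch of that arithmetic, with values chosen purely for illustration (plain int stands in for int_tp so the snippet is self-contained):

#include <cstdio>

// Output geometry for the strided-kernel (dilated) case, mirroring the
// arithmetic in greentea_im2col_sk_gpu above.
int main() {
  int height = 32, width = 32;
  int kernel_h = 3, kernel_w = 3;
  int pad_h = 0, pad_w = 0;
  int stride_h = 1, stride_w = 1;
  int kstride_h = 2, kstride_w = 2;  // kernel stride (dilation)

  // A 3x3 kernel with kernel stride 2 covers a 5x5 window.
  int ext_kernel_h = (kernel_h - 1) * kstride_h + 1;
  int ext_kernel_w = (kernel_w - 1) * kstride_w + 1;
  int height_col = (height + 2 * pad_h - ext_kernel_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - ext_kernel_w) / stride_w + 1;

  printf("extended kernel: %dx%d, col buffer grid: %dx%d\n",
         ext_kernel_h, ext_kernel_w, height_col, width_col);
  // Prints: extended kernel: 5x5, col buffer grid: 28x28
  return 0;
}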
Example #3
template<typename Dtype>
void ConvolutionLayerSpatial<Dtype>::swizzleWeights(
    const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top,
    int_tp swizzled_factor) {

  viennacl::ocl::context &ctx = viennacl::ocl::get_context(
      this->device_->id());
  viennacl::ocl::program &program = this->device_->program();
  viennacl::ocl::kernel &oclk_copy_weight = program.get_kernel(
      CL_KERNEL_SELECT("copyWeightsSwizzled"));
  cl_uint argIdx = 0;

  int_tp channels = this->channels_ / this->group_;
  // weight and swizzled_weights are device pointers obtained elsewhere in the
  // layer (not shown in this excerpt).
  oclk_copy_weight.arg(argIdx++, WrapHandle((cl_mem) weight, &ctx));
  oclk_copy_weight.arg(argIdx++, WrapHandle((cl_mem) swizzled_weights, &ctx));
  oclk_copy_weight.arg(argIdx++, kernel_w_);
  oclk_copy_weight.arg(argIdx++, kernel_h_);
  oclk_copy_weight.arg(argIdx++, channels);
  oclk_copy_weight.arg(argIdx++, this->num_output_);
  oclk_copy_weight.arg(argIdx++, swizzled_factor);
  const size_t global_work_size_Copy[3] = { (size_t) (this->num_output_
      * channels * kernel_w_ * kernel_h_), 1, 1 };

  OCL_CHECK(clEnqueueNDRangeKernel(ctx.get_queue().handle().get(),
                                       oclk_copy_weight.handle().get(), 3, NULL,
                                       global_work_size_Copy, NULL, 0, NULL,
                                       NULL));
}
Example #4
void greentea_col2im_ndsk_gpu(viennacl::ocl::program *prog,
                              viennacl::ocl::context *ctx, cl_mem data_col,
                              const int_tp data_col_off,
                              const int_tp num_spatial_axes,
                              const int_tp im_size, cl_mem im_shape,
                              cl_mem col_shape, cl_mem kernel_shape, cl_mem pad,
                              cl_mem stride, cl_mem kstride, cl_mem data_im,
                              int_tp data_off) {
  viennacl::ocl::kernel &kernel = prog->get_kernel(
      CL_KERNEL_SELECT("col2im_ndsk"));

  viennacl::ocl::enqueue(
      kernel(im_size, num_spatial_axes, WrapHandle(data_col, ctx), data_col_off,
             WrapHandle(im_shape, ctx), WrapHandle(col_shape, ctx),
             WrapHandle(kernel_shape, ctx), WrapHandle(pad, ctx),
             WrapHandle(stride, ctx), WrapHandle(kstride, ctx),
             WrapHandle(data_im, ctx), data_off),
      ctx->get_queue());
}
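
The N-D variant expects the per-axis im/col/kernel shapes, pads, and strides to already reside in device buffers. A minimal sketch of preparing one such buffer with the plain OpenCL API, assuming a valid cl_context is available; the helper name is illustrative and not part of Greentea, and cl_int is used for brevity where the real code sizes elements to int_tp:

#include <CL/cl.h>
#include <vector>

// Illustrative only: copy a small host-side shape/stride array into a
// read-only device buffer so it can be passed where cl_mem is expected above.
// In the Greentea path these buffers are normally owned by Caffe's device
// abstraction rather than created ad hoc like this.
cl_mem make_param_buffer(cl_context context, const std::vector<cl_int>& values,
                         cl_int* err) {
  return clCreateBuffer(context,
                        CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                        values.size() * sizeof(cl_int),
                        const_cast<cl_int*>(values.data()),
                        err);
}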
Example #5
void greentea_col2im_gpu(viennacl::ocl::program *prog,
                         viennacl::ocl::context *ctx, const cl_mem data_col,
                         const int_tp data_col_off, const int_tp channels,
                         const int_tp height, const int_tp width,
                         const int_tp patch_h, const int_tp patch_w,
                         const int_tp pad_h, const int_tp pad_w,
                         const int_tp stride_h, const int_tp stride_w,
                         cl_mem data_im, const int_tp data_im_off) {
  int_tp height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
  int_tp width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
  int_tp num_kernels = channels * height * width;

  viennacl::ocl::kernel &kernel = prog->get_kernel(CL_KERNEL_SELECT("col2im"));

  viennacl::ocl::enqueue(
      kernel(num_kernels, WrapHandle(data_col, ctx), data_col_off, height,
             width, channels, patch_h, patch_w, pad_h, pad_w, stride_h,
             stride_w, height_col, width_col, WrapHandle(data_im, ctx),
             data_im_off),
      ctx->get_queue());
}
Example #6
void greentea_im2col_gpu(viennacl::ocl::program *prog,
                         viennacl::ocl::context *ctx, const cl_mem data_im,
                         const int_tp data_im_off, const int_tp channels,
                         const int_tp height, const int_tp width,
                         const int_tp kernel_h, const int_tp kernel_w,
                         const int_tp pad_h, const int_tp pad_w,
                         const int_tp stride_h, const int_tp stride_w,
                         cl_mem data_col, const int_tp data_col_off) {
  // We are going to launch channels * height_col * width_col kernels, each
  // kernel responsible for copying a single-channel grid.
  int_tp height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  int_tp width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
  int_tp num_kernels = channels * height_col * width_col;

  viennacl::ocl::kernel &kernel = prog->get_kernel(CL_KERNEL_SELECT("im2col"));

  viennacl::ocl::enqueue(
      kernel(num_kernels, WrapHandle(data_im, ctx), data_im_off, height, width,
             kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, height_col,
             width_col, WrapHandle(data_col, ctx), data_col_off),
      ctx->get_queue());
}
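
For reference, a simplified CPU version of the same transform (single image, no kernel stride), written only to illustrate what each GPU work item in the kernel above computes; the column buffer is laid out as (channels * kernel_h * kernel_w) rows by (height_col * width_col) columns, which is the layout Caffe's GEMM path consumes:

// Simplified CPU im2col for a single image, matching the geometry used by
// greentea_im2col_gpu above. Out-of-bounds reads fall in the padding region
// and produce zeros.
void im2col_cpu_ref(const float* data_im, int channels, int height, int width,
                    int kernel_h, int kernel_w, int pad_h, int pad_w,
                    int stride_h, int stride_w, float* data_col) {
  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
  int channels_col = channels * kernel_h * kernel_w;
  for (int c = 0; c < channels_col; ++c) {
    int w_offset = c % kernel_w;
    int h_offset = (c / kernel_w) % kernel_h;
    int c_im = c / kernel_w / kernel_h;
    for (int h = 0; h < height_col; ++h) {
      for (int w = 0; w < width_col; ++w) {
        int h_im = h * stride_h - pad_h + h_offset;
        int w_im = w * stride_w - pad_w + w_offset;
        data_col[(c * height_col + h) * width_col + w] =
            (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width)
                ? data_im[(c_im * height + h_im) * width + w_im]
                : 0.f;
      }
    }
  }
}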
Example #7
template<typename Dtype>
void ConvolutionLayerSpatial<Dtype>::pad_image(
    const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top,
    int_tp image_offset,
    kernelConfig* config,
    int_tp imgNum) {
#ifdef USE_GREENTEA
  viennacl::ocl::context &ctx = viennacl::ocl::get_context(
      this->device_->id());
  // Copy kernel
  viennacl::ocl::program &program = this->device_->program();
  viennacl::ocl::kernel &oclk_copy = program.get_kernel(
                                       CL_KERNEL_SELECT("copyImage"));
  cl_uint argIdx = 0;
  int_tp col_data_offset = 0;
  int_tp channels = this->channels_;

  // bottom_data and col_data are device pointers obtained elsewhere in the
  // layer (not shown in this excerpt).
  oclk_copy.arg(argIdx++, WrapHandle((cl_mem) bottom_data, &ctx));
  oclk_copy.arg(argIdx++, image_offset);
  oclk_copy.arg(argIdx++, channels);
  oclk_copy.arg(argIdx++, height_);
  oclk_copy.arg(argIdx++, width_);
  oclk_copy.arg(argIdx++, padded_height_);
  oclk_copy.arg(argIdx++, padded_width_);
  oclk_copy.arg(argIdx++, pad_h_);
  oclk_copy.arg(argIdx++, pad_w_);
  oclk_copy.arg(argIdx++, WrapHandle((cl_mem) col_data, &ctx));
  oclk_copy.arg(argIdx++, col_data_offset);
  oclk_copy.arg(argIdx++, imgNum);
  const size_t global_work_size_Copy[3] = { (size_t) padded_width_,
      (size_t) padded_height_, (size_t) channels };

  OCL_CHECK(clEnqueueNDRangeKernel(ctx.get_queue().handle().get(),
                                   oclk_copy.handle().get(), 3, NULL,
                                   global_work_size_Copy, NULL, 0, NULL,
                                   NULL));
#endif
}
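
The copyImage kernel itself is not shown here; judging from its arguments (source size, padded size, pad offsets), it writes each input image into a padded destination buffer. A CPU sketch of that reading, assuming the padding region is zero-filled, purely for illustration:

// Illustrative CPU equivalent of the padded copy this kernel appears to
// perform: place a channels x height x width image into a zero-padded
// channels x padded_height x padded_width buffer at offset (pad_h, pad_w).
// This is an interpretation of the kernel's arguments, not its actual source.
void pad_image_cpu_ref(const float* src, float* dst, int channels,
                       int height, int width, int padded_height,
                       int padded_width, int pad_h, int pad_w) {
  for (int c = 0; c < channels; ++c) {
    for (int y = 0; y < padded_height; ++y) {
      for (int x = 0; x < padded_width; ++x) {
        int sy = y - pad_h;
        int sx = x - pad_w;
        dst[(c * padded_height + y) * padded_width + x] =
            (sy >= 0 && sy < height && sx >= 0 && sx < width)
                ? src[(c * height + sy) * width + sx]
                : 0.f;
      }
    }
  }
}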
Example #8
template<typename Dtype>
bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
                                 UMat& top,
                                 UMat& top_mask)
{
    bool ret = true;
    size_t global[] = { 128 * 128 };
    size_t local[] = { 128 };

    // support 2D case
    switch (pool_method_)
    {
    case LIBDNN_POOLING_METHOD_MAX:
        {
            bool haveMask = !top_mask.empty();
            ocl::Kernel oclk_max_pool_forward(
                haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask") : CL_KERNEL_SELECT("max_pool_forward"),
                ocl::dnn::ocl4dnn_pooling_oclsrc,
                format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
                       " -D STRIDE_W=%d -D STRIDE_H=%d"
                       " -D PAD_W=%d -D PAD_H=%d%s",
                       kernel_w_, kernel_h_,
                       stride_w_, stride_h_,
                       pad_w_, pad_h_,
                       haveMask ? " -D HAVE_MASK=1" : ""
                ));

            if (oclk_max_pool_forward.empty())
                return false;

            oclk_max_pool_forward.args(
                count_,
                ocl::KernelArg::PtrReadOnly(bottom),
                channels_,
                height_,
                width_,
                pooled_height_,
                pooled_width_,
                ocl::KernelArg::PtrWriteOnly(top),
                ocl::KernelArg::PtrWriteOnly(top_mask)
            );

            ret = oclk_max_pool_forward.run(1, global, local, false);
        }
        break;
    case LIBDNN_POOLING_METHOD_AVE:
        {
            CV_Assert(top_mask.empty());

            ocl::Kernel oclk_ave_pool_forward(CL_KERNEL_SELECT("ave_pool_forward"),
                ocl::dnn::ocl4dnn_pooling_oclsrc,
                format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
                       " -D STRIDE_W=%d -D STRIDE_H=%d"
                       " -D PAD_W=%d -D PAD_H=%d%s",
                       kernel_w_, kernel_h_,
                       stride_w_, stride_h_,
                       pad_w_, pad_h_,
                       avePoolPaddedArea ? " -D AVE_POOL_PADDING_AREA" : ""
                ));

            if (oclk_ave_pool_forward.empty())
                return false;

            oclk_ave_pool_forward.args(
                count_,
                ocl::KernelArg::PtrReadOnly(bottom),
                channels_,
                height_,
                width_,
                pooled_height_,
                pooled_width_,
                ocl::KernelArg::PtrWriteOnly(top)
            );

            ret = oclk_ave_pool_forward.run(1, global, local, false);
        }
        break;
    case LIBDNN_POOLING_METHOD_STO:
        {
            CV_Assert(top_mask.empty());

            ocl::Kernel oclk_sto_pool_forward(CL_KERNEL_SELECT("sto_pool_forward_test"),
                ocl::dnn::ocl4dnn_pooling_oclsrc,
                format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
                       " -D STRIDE_W=%d -D STRIDE_H=%d",
                       kernel_w_, kernel_h_,
                       stride_w_, stride_h_
                ));


            if (oclk_sto_pool_forward.empty())
                return false;

            oclk_sto_pool_forward.args(
                count_,
                ocl::KernelArg::PtrReadOnly(bottom),
                channels_,
                height_,
                width_,
                pooled_height_,
                pooled_width_,
                ocl::KernelArg::PtrWriteOnly(top)
            );

            ret = oclk_sto_pool_forward.run(1, global, local, false);
        }
        break;
    default:
        {
            ret = false;
            LOG(FATAL) << "Unknown pooling method.";
        }
    }
    return ret;
}
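
pooled_height_ and pooled_width_ are computed by the calling layer rather than here; below is a sketch of the usual Caffe-style (ceil-mode) geometry these kernels are typically paired with, shown as an assumption rather than as this class's actual code:

#include <cmath>

// Caffe-style pooled output size in ceil mode. The clip ensures the last
// pooling window starts inside the padded input. Illustrative only; the
// layer that owns this class is responsible for the real computation.
static int pooled_dim(int input, int kernel, int pad, int stride) {
  int out = static_cast<int>(
      std::ceil(static_cast<float>(input + 2 * pad - kernel) / stride)) + 1;
  if (pad > 0 && (out - 1) * stride >= input + pad) {
    --out;
  }
  return out;
}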