Code Example #1
File: greentea_im2col.cpp  Project: victorv/caffe
void greentea_im2col_sk_gpu(viennacl::ocl::program *prog,
                            viennacl::ocl::context *ctx, const cl_mem data_im,
                            const int_tp data_offset, const int_tp channels,
                            const int_tp height, const int_tp width,
                            const int_tp kernel_h, const int_tp kernel_w,
                            const int_tp pad_h, const int_tp pad_w,
                            const int_tp stride_h, const int_tp stride_w,
                            const int_tp kstride_h, const int_tp kstride_w,
                            cl_mem data_col) {
  int_tp ext_kernel_h = (kernel_h - 1) * kstride_h + 1;
  int_tp ext_kernel_w = (kernel_w - 1) * kstride_w + 1;
  int_tp height_col = (height + 2 * pad_h - ext_kernel_h) / stride_h + 1;
  int_tp width_col = (width + 2 * pad_w - ext_kernel_w) / stride_w + 1;
  int_tp num_kernels = channels * height_col * width_col;

  viennacl::ocl::kernel &kernel = prog->get_kernel(
      CL_KERNEL_SELECT("im2col_sk"));

  viennacl::ocl::enqueue(
      kernel(num_kernels, WrapHandle(data_im, ctx), data_offset, height, width,
             kernel_h, kernel_w, ext_kernel_h, ext_kernel_w, pad_h, pad_w,
             stride_h, stride_w, kstride_h, kstride_w, height_col, width_col,
             WrapHandle(data_col, ctx)),
      ctx->get_queue());
}
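
For reference, the "sk" (strided-kernel) variants above first inflate the kernel extent by the kernel stride and then apply the usual im2col output-size formula. A minimal standalone sketch of that arithmetic, using illustrative values chosen here rather than anything taken from the project:

#include <cstdio>

// Illustrative only: recomputes the shapes used by greentea_im2col_sk_gpu.
int main() {
  int height = 32, width = 32;        // input spatial size (example values)
  int kernel_h = 3, kernel_w = 3;     // kernel size
  int pad_h = 0, pad_w = 0;           // padding
  int stride_h = 1, stride_w = 1;     // stride
  int kstride_h = 2, kstride_w = 2;   // kernel stride (dilation-like spacing)

  // Effective kernel extent once kstride spreads the taps apart.
  int ext_kernel_h = (kernel_h - 1) * kstride_h + 1;  // 5
  int ext_kernel_w = (kernel_w - 1) * kstride_w + 1;  // 5

  // Column-buffer spatial size, same formula as in the code above.
  int height_col = (height + 2 * pad_h - ext_kernel_h) / stride_h + 1;  // 28
  int width_col = (width + 2 * pad_w - ext_kernel_w) / stride_w + 1;    // 28

  printf("ext kernel %dx%d, col %dx%d\n",
         ext_kernel_h, ext_kernel_w, height_col, width_col);
  return 0;
}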
Code Example #2
File: greentea_im2col.cpp  Project: victorv/caffe
void greentea_col2im_sk_gpu(viennacl::ocl::program *prog,
                            viennacl::ocl::context *ctx, const cl_mem data_col,
                            const int_tp channels, const int_tp height,
                            const int_tp width, const int_tp patch_h,
                            const int_tp patch_w, const int_tp pad_h,
                            const int_tp pad_w, const int_tp stride_h,
                            const int_tp stride_w, const int_tp kstride_h,
                            const int_tp kstride_w, cl_mem data_im,
                            const int_tp data_offset) {
  if (stride_w > 1 || stride_h > 1 || pad_h > 0 || pad_w > 0) {
    LOG(FATAL) << "stride greater than 1 or pad greater than 0"
               << " not tested in col2im_sk_gpu().";
  }

  int_tp ext_patch_h = (patch_h - 1) * kstride_h + 1;
  int_tp ext_patch_w = (patch_w - 1) * kstride_w + 1;
  int_tp height_col = (height + 2 * pad_h - ext_patch_h) / stride_h + 1;
  int_tp width_col = (width + 2 * pad_w - ext_patch_w) / stride_w + 1;
  int_tp num_kernels = channels * height * width;

  viennacl::ocl::kernel &kernel = prog->get_kernel(
      CL_KERNEL_SELECT("col2im_sk"));

  viennacl::ocl::enqueue(
      kernel(num_kernels, WrapHandle(data_col, ctx), height, width, channels,
          patch_h, patch_w, ext_patch_h, ext_patch_w,
          pad_h, pad_w, stride_h, stride_w, kstride_h, kstride_w,
          height_col, width_col, WrapHandle(data_im, ctx), data_offset),
      ctx->get_queue());
}
Code Example #3
File: conv_layer_spatial.cpp  Project: torms3/caffe
template<typename Dtype>
void ConvolutionLayerSpatial<Dtype>::swizzleWeights(
    const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top,
    int_tp swizzled_factor) {

  viennacl::ocl::context &ctx = viennacl::ocl::get_context(
      this->device_->id());
  viennacl::ocl::program &program = this->device_->program();
  viennacl::ocl::kernel &oclk_copy_weight = program.get_kernel(
      CL_KERNEL_SELECT("copyWeightsSwizzled"));
  cl_uint argIdx = 0;

  int_tp channels = this->channels_ / this->group_;
  oclk_copy_weight.arg(argIdx++, WrapHandle((cl_mem) weight, &ctx));
  oclk_copy_weight.arg(argIdx++, WrapHandle((cl_mem) swizzled_weights, &ctx));
  oclk_copy_weight.arg(argIdx++, kernel_w_);
  oclk_copy_weight.arg(argIdx++, kernel_h_);
  oclk_copy_weight.arg(argIdx++, channels);
  oclk_copy_weight.arg(argIdx++, this->num_output_);
  oclk_copy_weight.arg(argIdx++, swizzled_factor);
  const size_t global_work_size_Copy[3] = { (size_t) (this->num_output_
      * channels * kernel_w_ * kernel_h_), 1, 1 };

  OCL_CHECK(clEnqueueNDRangeKernel(ctx.get_queue().handle().get(),
                                       oclk_copy_weight.handle().get(), 3, NULL,
                                       global_work_size_Copy, NULL, 0, NULL,
                                       NULL));
}
Code Example #4
File: greentea_im2col.cpp  Project: victorv/caffe
void greentea_col2im_gpu(viennacl::ocl::program *prog,
                         viennacl::ocl::context *ctx, const cl_mem data_col,
                         const int_tp data_col_off, const int_tp channels,
                         const int_tp height, const int_tp width,
                         const int_tp patch_h, const int_tp patch_w,
                         const int_tp pad_h, const int_tp pad_w,
                         const int_tp stride_h, const int_tp stride_w,
                         cl_mem data_im, const int_tp data_im_off) {
  int_tp height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
  int_tp width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
  int_tp num_kernels = channels * height * width;

  viennacl::ocl::kernel &kernel = prog->get_kernel(CL_KERNEL_SELECT("col2im"));

  viennacl::ocl::enqueue(
      kernel(num_kernels, WrapHandle(data_col, ctx), data_col_off, height,
             width, channels, patch_h, patch_w, pad_h, pad_w, stride_h,
             stride_w, height_col, width_col, WrapHandle(data_im, ctx),
             data_im_off),
      ctx->get_queue());
}
Code Example #5
File: greentea_im2col.cpp  Project: victorv/caffe
void greentea_im2col_gpu(viennacl::ocl::program *prog,
                         viennacl::ocl::context *ctx, const cl_mem data_im,
                         const int_tp data_im_off, const int_tp channels,
                         const int_tp height, const int_tp width,
                         const int_tp kernel_h, const int_tp kernel_w,
                         const int_tp pad_h, const int_tp pad_w,
                         const int_tp stride_h, const int_tp stride_w,
                         cl_mem data_col, const int_tp data_col_off) {
  // We are going to launch channels * height_col * width_col kernels, each
  // kernel responsible for copying a single-channel grid.
  int_tp height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  int_tp width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
  int_tp num_kernels = channels * height_col * width_col;

  viennacl::ocl::kernel &kernel = prog->get_kernel(CL_KERNEL_SELECT("im2col"));

  viennacl::ocl::enqueue(
      kernel(num_kernels, WrapHandle(data_im, ctx), data_im_off, height, width,
             kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, height_col,
             width_col, WrapHandle(data_col, ctx), data_col_off),
      ctx->get_queue());
}
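
The buffer written by greentea_im2col_gpu follows the standard Caffe im2col layout: channels * kernel_h * kernel_w rows by height_col * width_col columns. A small, hypothetical sizing helper (not part of the project) that makes explicit how many elements data_col must hold:

#include <cstddef>
#include <cstdio>

// Hypothetical helper, shown only to make the column-buffer shape explicit.
size_t im2col_buffer_elems(int channels, int height, int width,
                           int kernel_h, int kernel_w,
                           int pad_h, int pad_w,
                           int stride_h, int stride_w) {
  int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
  // Rows: channels * kernel_h * kernel_w; columns: height_col * width_col.
  return static_cast<size_t>(channels) * kernel_h * kernel_w
      * height_col * width_col;
}

int main() {
  // Example: 3-channel 224x224 image, 3x3 kernel, pad 1, stride 1.
  printf("%zu elements\n", im2col_buffer_elems(3, 224, 224, 3, 3, 1, 1, 1, 1));
  return 0;
}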
Code Example #6
File: conv_layer_spatial.cpp  Project: torms3/caffe
template<typename Dtype>
void ConvolutionLayerSpatial<Dtype>::setBufferKernelArg(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
    viennacl::ocl::kernel *kernel,
    const cl_uint &argIdx,
    viennacl::ocl::context *ctx,
    cl_mem buffer, size_t offset,
    size_t size, bool readOnly,
    bool preserved) {

  if (offset == 0) {
    kernel->arg(argIdx, WrapHandle((cl_mem) buffer, ctx));
    return;
  }

  if (preserved &&
      subBufferMap.find(std::make_tuple(buffer, offset, size))
          != subBufferMap.end()) {
    kernel->arg(argIdx,
                WrapHandle(subBufferMap.find(
                    std::make_tuple(buffer, offset, size))->second, ctx));
    return;
  }
  cl_buffer_region region;
  region.origin = offset * sizeof(Dtype);
  region.size = size * sizeof(Dtype);
  cl_mem_flags memFlags = readOnly ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE;
  cl_int error;
  cl_mem sub_buffer = clCreateSubBuffer(buffer, memFlags,
                        CL_BUFFER_CREATE_TYPE_REGION,
                        &region, &error);
  CHECK_EQ(error, CL_SUCCESS) << "Failed to create sub buffer." << std::endl;
  kernel->arg(argIdx, WrapHandle(sub_buffer, ctx));
  if (preserved)
    subBufferMap.insert(std::make_pair(std::make_tuple(buffer, offset, size),
                        sub_buffer));
  else
    tmpSubBuffers.push_back(sub_buffer);
}
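
One caveat with the clCreateSubBuffer call above: OpenCL requires the region origin to be a multiple of the device's CL_DEVICE_MEM_BASE_ADDR_ALIGN (reported in bits); otherwise the call fails with CL_MISALIGNED_SUB_BUFFER_OFFSET. A small sketch, under that assumption, of how the alignment could be queried before picking an offset:

#include <CL/cl.h>

// Returns the minimum sub-buffer origin alignment for a device, in bytes.
// CL_DEVICE_MEM_BASE_ADDR_ALIGN is reported in bits, hence the division by 8.
static cl_uint sub_buffer_align_bytes(cl_device_id device) {
  cl_uint align_bits = 0;
  clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN,
                  sizeof(align_bits), &align_bits, NULL);
  return align_bits / 8;
}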
Code Example #7
File: conv_layer_spatial.cpp  Project: torms3/caffe
template<typename Dtype>
void ConvolutionLayerSpatial<Dtype>::pad_image(
    const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top,
    int_tp image_offset,
    kernelConfig* config,
    int_tp imgNum) {
#ifdef USE_GREENTEA
  viennacl::ocl::context &ctx = viennacl::ocl::get_context(
      this->device_->id());
  // Copy kernel
  viennacl::ocl::program &program = this->device_->program();
  viennacl::ocl::kernel &oclk_copy = program.get_kernel(
                                       CL_KERNEL_SELECT("copyImage"));
  cl_uint argIdx = 0;
  int_tp col_data_offset = 0;
  int_tp channels = this->channels_;

  oclk_copy.arg(argIdx++, WrapHandle((cl_mem) bottom_data, &ctx));
  oclk_copy.arg(argIdx++, image_offset);
  oclk_copy.arg(argIdx++, channels);
  oclk_copy.arg(argIdx++, height_);
  oclk_copy.arg(argIdx++, width_);
  oclk_copy.arg(argIdx++, padded_height_);
  oclk_copy.arg(argIdx++, padded_width_);
  oclk_copy.arg(argIdx++, pad_h_);
  oclk_copy.arg(argIdx++, pad_w_);
  oclk_copy.arg(argIdx++, WrapHandle((cl_mem) col_data, &ctx));
  oclk_copy.arg(argIdx++, col_data_offset);
  oclk_copy.arg(argIdx++, imgNum);
  const size_t global_work_size_Copy[3] = { (size_t) padded_width_,
      (size_t) padded_height_, (size_t) channels };

  clEnqueueNDRangeKernel(ctx.get_queue().handle().get(),
                         oclk_copy.handle().get(), 3, NULL,
                         global_work_size_Copy, NULL, 0, NULL, NULL);
#endif
}
Code Example #8
File: greentea_im2col.cpp  Project: victorv/caffe
void greentea_col2im_ndsk_gpu(viennacl::ocl::program *prog,
                              viennacl::ocl::context *ctx, cl_mem data_col,
                              const int_tp data_col_off,
                              const int_tp num_spatial_axes,
                              const int_tp im_size, cl_mem im_shape,
                              cl_mem col_shape, cl_mem kernel_shape, cl_mem pad,
                              cl_mem stride, cl_mem kstride, cl_mem data_im,
                              int_tp data_off) {
  viennacl::ocl::kernel &kernel = prog->get_kernel(
      CL_KERNEL_SELECT("col2im_ndsk"));

  viennacl::ocl::enqueue(
      kernel(im_size, num_spatial_axes, WrapHandle(data_col, ctx), data_col_off,
             WrapHandle(im_shape, ctx), WrapHandle(col_shape, ctx),
             WrapHandle(kernel_shape, ctx), WrapHandle(pad, ctx),
             WrapHandle(stride, ctx), WrapHandle(kstride, ctx),
             WrapHandle(data_im, ctx), data_off),
      ctx->get_queue());
}
Code Example #9
File: conv_layer_spatial.cpp  Project: torms3/caffe
template<>
cl_int ConvolutionLayerSpatial<float>::convolve(
    const vector<Blob<float>*>& bottom, const vector<Blob<float>*>& top,
    int_tp index,
    int_tp numImages, kernelConfig* config) {

  viennacl::ocl::context &ctx = viennacl::ocl::get_context(this->device_->id());
  viennacl::ocl::program & program = ctx.get_program(config->kernelName);
  viennacl::ocl::kernel &kernel = program.get_kernel(config->kernelName);
  cl_int err = 0;

  if (config->kernelType != 2) {
    for (int_tp n = 0; n < numImages; ++n) {
      for (int_tp g = 0; g < group_; ++g) {
        bias_offset_ = M_ * g;
        int_tp image_offset = n * this->bottom_dim_
            + width_ * height_ * (channels_ / group_) * g;
        int_tp output_image_offset = n * this->top_dim_
            + output_w_ * output_h_ * M_ * g;

        cl_uint argIdx = 0;
        int_tp kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_) * M_
            * g;

        // Copy image
        if (pad_w_ > 0 || pad_h_ > 0) {
          pad_image(bottom, top, image_offset, config, numImages);
          image_offset = 0;
          kernel.arg(argIdx++, WrapHandle((cl_mem) col_data, &ctx));
        } else {
          kernel.arg(argIdx++, WrapHandle((cl_mem) bottom_data, &ctx));
        }
        kernel.arg(argIdx++, image_offset);
        kernel.arg(argIdx++, WrapHandle((cl_mem) weight, &ctx));
        kernel.arg(argIdx++, kernel_offset);
        kernel.arg(argIdx++, WrapHandle((cl_mem) bias_, &ctx));
        kernel.arg(argIdx++, bias_offset_);
        kernel.arg(argIdx++, WrapHandle((cl_mem) top_data, &ctx));
        kernel.arg(argIdx++, output_image_offset);
        kernel.arg(argIdx++, (uint16_t)padded_width_);
        kernel.arg(argIdx++, (uint16_t)padded_height_);
        kernel.arg(argIdx++, (uint16_t)output_w_);
        kernel.arg(argIdx++, (uint16_t)output_h_);
        if (config->use_null_local) {
          err = clEnqueueNDRangeKernel(ctx.get_queue().handle().get(),
                                       kernel.handle().get(), 3,
                                       NULL,
                                       config->global_work_size, NULL, 0, NULL,
                                       NULL);
        } else {
          err = clEnqueueNDRangeKernel(ctx.get_queue().handle().get(),
                                       kernel.handle().get(), 3,
                                       NULL,
                                       config->global_work_size,
                                       config->local_work_size, 0, NULL,
                                       NULL);
        }

        if (err != CL_SUCCESS)
          return err;
        viennacl::backend::finish();
      }
    }
  } else {
    swizzleWeights(bottom, top, 16);
    size_t total_bottom_size = bottom_dim_ * numImages;
    size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
    size_t total_bias_size = M_ * group_;
    size_t total_top_size = top_dim_ * numImages;
    for (int_tp g = 0; g < group_; ++g) {
      bias_offset_ = M_ * g;
      int_tp image_offset = width_ * height_ * (channels_ / group_) * g;
      int_tp output_image_offset = output_w_ * output_h_ * M_ * g;

      cl_uint argIdx = 0;
      int_tp kernel_offset = kernel_h_ * kernel_w_
                             * (channels_ / group_) * M_ * g;
      // Copy image
      cl_mem input_image;
      if (pad_w_ > 0 || pad_h_ > 0) {
        pad_image(bottom, top, image_offset, config, numImages);
        image_offset = 0;
        input_image = (cl_mem) col_data;
      } else {
        input_image = (cl_mem) bottom_data;
      }
      setBufferKernelArg(bottom, top, &kernel, argIdx++, &ctx, input_image,
                         image_offset, total_bottom_size - image_offset,
                         true, false);
      setBufferKernelArg(bottom, top, &kernel, argIdx++, &ctx,
                         (cl_mem) swizzled_weights,
                         kernel_offset, total_kernel_size - kernel_offset,
                         true, true);
      setBufferKernelArg(bottom, top, &kernel, argIdx++, &ctx, (cl_mem) bias_,
                         bias_offset_, total_bias_size - bias_offset_,
                         true, true);
      setBufferKernelArg(bottom, top, &kernel, argIdx++, &ctx,
                         (cl_mem) top_data,
                         output_image_offset,
                         total_top_size - output_image_offset,
                         false, false);
      kernel.arg(argIdx++, (uint16_t)padded_width_);
      kernel.arg(argIdx++, (uint16_t)padded_height_);
      kernel.arg(argIdx++, (uint16_t)output_w_);
      kernel.arg(argIdx++, (uint16_t)output_h_);
      err = clEnqueueNDRangeKernel(ctx.get_queue().handle().get(),
                                   kernel.handle().get(), 3,
                                   NULL,
                                   config->global_work_size,
                                   config->local_work_size, 0, NULL,
                                   NULL);
      if (err != CL_SUCCESS)
        return err;
      viennacl::backend::finish();
    }

    if (group_ > 1) {
      viennacl::backend::finish();
      cleanTmpSubBuffers(bottom, top);
    }
  }

  return err;
}
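
The per-group and per-image offsets in the first branch of convolve come straight from the blob layouts. A standalone sketch of the same arithmetic with illustrative values (the names mirror the members used above but are plain locals here, and top_dim is assumed to be num_output * output_h * output_w as in standard Caffe):

#include <cstdio>

int main() {
  // Illustrative values only; in the layer these come from the blob shapes.
  int channels = 8, group = 2, width = 16, height = 16;
  int M = 4;                                      // output channels per group
  int output_w = 16, output_h = 16;
  int bottom_dim = channels * width * height;     // elements in one input image
  int top_dim = group * M * output_w * output_h;  // elements in one output image

  for (int n = 0; n < 2; ++n) {                   // two images in the batch
    for (int g = 0; g < group; ++g) {
      int image_offset = n * bottom_dim + width * height * (channels / group) * g;
      int output_image_offset = n * top_dim + output_w * output_h * M * g;
      printf("n=%d g=%d image_offset=%d output_image_offset=%d\n",
             n, g, image_offset, output_image_offset);
    }
  }
  return 0;
}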