void greentea_col2im_sk_gpu(viennacl::ocl::program *prog, viennacl::ocl::context *ctx, const cl_mem data_col, const int_tp channels, const int_tp height, const int_tp width, const int_tp patch_h, const int_tp patch_w, const int_tp pad_h, const int_tp pad_w, const int_tp stride_h, const int_tp stride_w, const int_tp kstride_h, const int_tp kstride_w, cl_mem data_im, const int_tp data_offset) { if (stride_w > 1 || stride_h > 1 || pad_h > 0 || pad_w > 0) { LOG(FATAL)<< "stride greater than 1 or pad greater than 0" << " not tested in col2im_sk_gpu()."; } int_tp ext_patch_h = (patch_h - 1) * kstride_h + 1; int_tp ext_patch_w = (patch_w - 1) * kstride_w + 1; int_tp height_col = (height + 2 * pad_h - ext_patch_h) / stride_h + 1; int_tp width_col = (width + 2 * pad_w - ext_patch_w) / stride_w + 1; int_tp num_kernels = channels * height * width; viennacl::ocl::kernel &kernel = prog->get_kernel( CL_KERNEL_SELECT("col2im_sk")); viennacl::ocl::enqueue( kernel(num_kernels, WrapHandle(data_col, ctx), height, width, channels, patch_h, patch_w, ext_patch_h, ext_patch_w, pad_h, pad_w, stride_h, stride_w, kstride_h, kstride_w, height_col, width_col, WrapHandle(data_im, ctx), data_offset), ctx->get_queue()); }
void greentea_im2col_sk_gpu(viennacl::ocl::program *prog, viennacl::ocl::context *ctx, const cl_mem data_im, const int_tp data_offset, const int_tp channels, const int_tp height, const int_tp width, const int_tp kernel_h, const int_tp kernel_w, const int_tp pad_h, const int_tp pad_w, const int_tp stride_h, const int_tp stride_w, const int_tp kstride_h, const int_tp kstride_w, cl_mem data_col) { int_tp ext_kernel_h = (kernel_h - 1) * kstride_h + 1; int_tp ext_kernel_w = (kernel_w - 1) * kstride_w + 1; int_tp height_col = (height + 2 * pad_h - ext_kernel_h) / stride_h + 1; int_tp width_col = (width + 2 * pad_w - ext_kernel_w) / stride_w + 1; int_tp num_kernels = channels * height_col * width_col; viennacl::ocl::kernel &kernel = prog->get_kernel( CL_KERNEL_SELECT("im2col_sk")); viennacl::ocl::enqueue( kernel(num_kernels, WrapHandle(data_im, ctx), data_offset, height, width, kernel_h, kernel_w, ext_kernel_h, ext_kernel_w, pad_h, pad_w, stride_h, stride_w, kstride_h, kstride_w, height_col, width_col, WrapHandle(data_col, ctx)), ctx->get_queue()); }
// Launches the copyWeightsSwizzled kernel to repack the convolution weights
// into the interleaved layout the spatial-convolution kernels expect.
// bottom / top are part of the layer-method signature but are not read here.
void ConvolutionLayerSpatial<Dtype>::swizzleWeights(
    const vector<Blob<Dtype>*>& bottom,
    const vector<Blob<Dtype>*>& top,
    int_tp swizzled_factor) {
  viennacl::ocl::context &ctx = viennacl::ocl::get_context(
      this->device_->id());
  viennacl::ocl::program &program = this->device_->program();
  viennacl::ocl::kernel &oclk_copy_weight = program.get_kernel(
      CL_KERNEL_SELECT("copyWeightsSwizzled"));
  cl_uint argIdx = 0;
  // Per-group input channel count.
  int_tp channels = this->channels_ / this->group_;
  // NOTE(review): weight / swizzled_weights are presumably device buffers
  // owned by this layer (declared outside this view) — confirm against the
  // full class definition.
  // Argument order must match the copyWeightsSwizzled kernel signature.
  oclk_copy_weight.arg(argIdx++, WrapHandle((cl_mem) weight, &ctx));
  oclk_copy_weight.arg(argIdx++, WrapHandle((cl_mem) swizzled_weights, &ctx));
  oclk_copy_weight.arg(argIdx++, kernel_w_);
  oclk_copy_weight.arg(argIdx++, kernel_h_);
  oclk_copy_weight.arg(argIdx++, channels);
  oclk_copy_weight.arg(argIdx++, this->num_output_);
  oclk_copy_weight.arg(argIdx++, swizzled_factor);
  // One work item per weight element (1-D launch expressed as 3-D).
  const size_t global_work_size_Copy[3] = { (size_t) (this->num_output_
      * channels * kernel_w_ * kernel_h_), 1, 1 };
  OCL_CHECK(clEnqueueNDRangeKernel(ctx.get_queue().handle().get(),
                                   oclk_copy_weight.handle().get(), 3, NULL,
                                   global_work_size_Copy, NULL, 0, NULL,
                                   NULL));
}
void greentea_col2im_ndsk_gpu(viennacl::ocl::program *prog, viennacl::ocl::context *ctx, cl_mem data_col, const int_tp data_col_off, const int_tp num_spatial_axes, const int_tp im_size, cl_mem im_shape, cl_mem col_shape, cl_mem kernel_shape, cl_mem pad, cl_mem stride, cl_mem kstride, cl_mem data_im, int_tp data_off) { viennacl::ocl::kernel &kernel = prog->get_kernel( CL_KERNEL_SELECT("col2im_ndsk")); viennacl::ocl::enqueue( kernel(im_size, num_spatial_axes, WrapHandle(data_col, ctx), data_col_off, WrapHandle(im_shape, ctx), WrapHandle(col_shape, ctx), WrapHandle(kernel_shape, ctx), WrapHandle(pad, ctx), WrapHandle(stride, ctx), WrapHandle(kstride, ctx), WrapHandle(data_im, ctx), data_off), ctx->get_queue()); }
void greentea_col2im_gpu(viennacl::ocl::program *prog, viennacl::ocl::context *ctx, const cl_mem data_col, const int_tp data_col_off, const int_tp channels, const int_tp height, const int_tp width, const int_tp patch_h, const int_tp patch_w, const int_tp pad_h, const int_tp pad_w, const int_tp stride_h, const int_tp stride_w, cl_mem data_im, const int_tp data_im_off) { int_tp height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; int_tp width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; int_tp num_kernels = channels * height * width; viennacl::ocl::kernel &kernel = prog->get_kernel(CL_KERNEL_SELECT("col2im")); viennacl::ocl::enqueue( kernel(num_kernels, WrapHandle(data_col, ctx), data_col_off, height, width, channels, patch_h, patch_w, pad_h, pad_w, stride_h, stride_w, height_col, width_col, WrapHandle(data_im, ctx), data_im_off), ctx->get_queue()); }
void greentea_im2col_gpu(viennacl::ocl::program *prog, viennacl::ocl::context *ctx, const cl_mem data_im, const int_tp data_im_off, const int_tp channels, const int_tp height, const int_tp width, const int_tp kernel_h, const int_tp kernel_w, const int_tp pad_h, const int_tp pad_w, const int_tp stride_h, const int_tp stride_w, cl_mem data_col, const int_tp data_col_off) { // We are going to launch channels * height_col * width_col kernels, each // kernel responsible for copying a single-channel grid. int_tp height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; int_tp width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; int_tp num_kernels = channels * height_col * width_col; viennacl::ocl::kernel &kernel = prog->get_kernel(CL_KERNEL_SELECT("im2col")); viennacl::ocl::enqueue( kernel(num_kernels, WrapHandle(data_im, ctx), data_im_off, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, height_col, width_col, WrapHandle(data_col, ctx), data_col_off), ctx->get_queue()); }
// Copies one input image into a zero-padded destination buffer on the device
// (copyImage kernel) so later convolution kernels need no boundary handling.
// bottom / top / config are part of the layer-method signature but are not
// read here.
void ConvolutionLayerSpatial<Dtype>::pad_image(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
    int_tp image_offset, kernelConfig* config, int_tp imgNum) {
#ifdef USE_GREENTEA
  viennacl::ocl::context &ctx = viennacl::ocl::get_context(
      this->device_->id());

  // Copy kernel
  viennacl::ocl::program &program = this->device_->program();
  viennacl::ocl::kernel &oclk_copy = program.get_kernel(
      CL_KERNEL_SELECT("copyImage"));

  cl_uint argIdx = 0;
  int_tp col_data_offset = 0;
  int_tp channels = this->channels_;
  // NOTE(review): bottom_data / col_data are presumably device buffers set up
  // by the caller (declared outside this view) — confirm against the full
  // class definition.
  // Argument order must match the copyImage kernel signature.
  oclk_copy.arg(argIdx++, WrapHandle((cl_mem) bottom_data, &ctx));
  oclk_copy.arg(argIdx++, image_offset);
  oclk_copy.arg(argIdx++, channels);
  oclk_copy.arg(argIdx++, height_);
  oclk_copy.arg(argIdx++, width_);
  oclk_copy.arg(argIdx++, padded_height_);
  oclk_copy.arg(argIdx++, padded_width_);
  oclk_copy.arg(argIdx++, pad_h_);
  oclk_copy.arg(argIdx++, pad_w_);
  oclk_copy.arg(argIdx++, WrapHandle((cl_mem) col_data, &ctx));
  oclk_copy.arg(argIdx++, col_data_offset);
  oclk_copy.arg(argIdx++, imgNum);

  // One work item per padded output element.
  const size_t global_work_size_Copy[3] = { (size_t) padded_width_,
      (size_t) padded_height_, (size_t) channels };

  // Fix: the enqueue status was previously discarded; check it with
  // OCL_CHECK for consistency with swizzleWeights().
  OCL_CHECK(clEnqueueNDRangeKernel(ctx.get_queue().handle().get(),
                                   oclk_copy.handle().get(), 3, NULL,
                                   global_work_size_Copy, NULL, 0, NULL,
                                   NULL));
#endif
}
// Runs the 2D pooling forward pass on the GPU via OpenCL, dispatching on
// pool_method_ to the max / average / stochastic kernel. Returns false if
// kernel creation or execution fails.
bool OCL4DNNPool<Dtype>::Forward(const UMat& bottom,
                                 UMat& top,
                                 UMat& top_mask)
{
    bool ret = true;
    // Fixed 1-D launch geometry: 128*128 global work items, groups of 128.
    size_t global[] = { 128 * 128 };
    size_t local[] = { 128 };

    // support 2D case
    switch (pool_method_)
    {
    case LIBDNN_POOLING_METHOD_MAX:
        {
            // A non-empty mask output selects the kernel variant that also
            // records argmax positions.
            bool haveMask = !top_mask.empty();
            ocl::Kernel oclk_max_pool_forward(
                haveMask ? CL_KERNEL_SELECT("max_pool_forward_mask")
                         : CL_KERNEL_SELECT("max_pool_forward"),
                ocl::dnn::ocl4dnn_pooling_oclsrc,
                // Pooling geometry is baked into the kernel at build time
                // via -D macros.
                format("-D KERNEL_MAX_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
                       " -D STRIDE_W=%d -D STRIDE_H=%d"
                       " -D PAD_W=%d -D PAD_H=%d%s",
                       kernel_w_, kernel_h_,
                       stride_w_, stride_h_,
                       pad_w_, pad_h_,
                       haveMask ? " -D HAVE_MASK=1" : ""));
            if (oclk_max_pool_forward.empty())
                return false;
            // NOTE(review): top_mask is bound even when empty (the kernel
            // built without HAVE_MASK presumably ignores that arg) — confirm
            // against the OpenCL source.
            oclk_max_pool_forward.args(
                count_,
                ocl::KernelArg::PtrReadOnly(bottom),
                channels_, height_, width_,
                pooled_height_, pooled_width_,
                ocl::KernelArg::PtrWriteOnly(top),
                ocl::KernelArg::PtrWriteOnly(top_mask));
            ret = oclk_max_pool_forward.run(1, global, local, false);
        }
        break;
    case LIBDNN_POOLING_METHOD_AVE:
        {
            CV_Assert(top_mask.empty());
            ocl::Kernel oclk_ave_pool_forward(
                CL_KERNEL_SELECT("ave_pool_forward"),
                ocl::dnn::ocl4dnn_pooling_oclsrc,
                format("-D KERNEL_AVE_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
                       " -D STRIDE_W=%d -D STRIDE_H=%d"
                       " -D PAD_W=%d -D PAD_H=%d%s",
                       kernel_w_, kernel_h_,
                       stride_w_, stride_h_,
                       pad_w_, pad_h_,
                       // Build flag controls whether padded area is counted
                       // in the averaging divisor.
                       avePoolPaddedArea ? " -D AVE_POOL_PADDING_AREA" : ""));
            if (oclk_ave_pool_forward.empty())
                return false;
            oclk_ave_pool_forward.args(
                count_,
                ocl::KernelArg::PtrReadOnly(bottom),
                channels_, height_, width_,
                pooled_height_, pooled_width_,
                ocl::KernelArg::PtrWriteOnly(top));
            ret = oclk_ave_pool_forward.run(1, global, local, false);
        }
        break;
    case LIBDNN_POOLING_METHOD_STO:
        {
            CV_Assert(top_mask.empty());
            // Stochastic pooling, inference ("test") variant.
            ocl::Kernel oclk_sto_pool_forward(
                CL_KERNEL_SELECT("sto_pool_forward_test"),
                ocl::dnn::ocl4dnn_pooling_oclsrc,
                format("-D KERNEL_STO_POOL=1 -D KERNEL_W=%d -D KERNEL_H=%d"
                       " -D STRIDE_W=%d -D STRIDE_H=%d",
                       kernel_w_, kernel_h_,
                       stride_w_, stride_h_));
            if (oclk_sto_pool_forward.empty())
                return false;
            oclk_sto_pool_forward.args(
                count_,
                ocl::KernelArg::PtrReadOnly(bottom),
                channels_, height_, width_,
                pooled_height_, pooled_width_,
                ocl::KernelArg::PtrWriteOnly(top));
            ret = oclk_sto_pool_forward.run(1, global, local, false);
        }
        break;
    default:
        {
            ret = false;
            LOG(FATAL)<< "Unknown pooling method.";
        }
    }
    return ret;
}