void greentea_im2col_sk_gpu(viennacl::ocl::program *prog, viennacl::ocl::context *ctx, const cl_mem data_im, const int_tp data_offset, const int_tp channels, const int_tp height, const int_tp width, const int_tp kernel_h, const int_tp kernel_w, const int_tp pad_h, const int_tp pad_w, const int_tp stride_h, const int_tp stride_w, const int_tp kstride_h, const int_tp kstride_w, cl_mem data_col) { int_tp ext_kernel_h = (kernel_h - 1) * kstride_h + 1; int_tp ext_kernel_w = (kernel_w - 1) * kstride_w + 1; int_tp height_col = (height + 2 * pad_h - ext_kernel_h) / stride_h + 1; int_tp width_col = (width + 2 * pad_w - ext_kernel_w) / stride_w + 1; int_tp num_kernels = channels * height_col * width_col; viennacl::ocl::kernel &kernel = prog->get_kernel( CL_KERNEL_SELECT("im2col_sk")); viennacl::ocl::enqueue( kernel(num_kernels, WrapHandle(data_im, ctx), data_offset, height, width, kernel_h, kernel_w, ext_kernel_h, ext_kernel_w, pad_h, pad_w, stride_h, stride_w, kstride_h, kstride_w, height_col, width_col, WrapHandle(data_col, ctx)), ctx->get_queue()); }
void greentea_col2im_sk_gpu(viennacl::ocl::program *prog, viennacl::ocl::context *ctx, const cl_mem data_col, const int_tp channels, const int_tp height, const int_tp width, const int_tp patch_h, const int_tp patch_w, const int_tp pad_h, const int_tp pad_w, const int_tp stride_h, const int_tp stride_w, const int_tp kstride_h, const int_tp kstride_w, cl_mem data_im, const int_tp data_offset) { if (stride_w > 1 || stride_h > 1 || pad_h > 0 || pad_w > 0) { LOG(FATAL)<< "stride greater than 1 or pad greater than 0" << " not tested in col2im_sk_gpu()."; } int_tp ext_patch_h = (patch_h - 1) * kstride_h + 1; int_tp ext_patch_w = (patch_w - 1) * kstride_w + 1; int_tp height_col = (height + 2 * pad_h - ext_patch_h) / stride_h + 1; int_tp width_col = (width + 2 * pad_w - ext_patch_w) / stride_w + 1; int_tp num_kernels = channels * height * width; viennacl::ocl::kernel &kernel = prog->get_kernel( CL_KERNEL_SELECT("col2im_sk")); viennacl::ocl::enqueue( kernel(num_kernels, WrapHandle(data_col, ctx), height, width, channels, patch_h, patch_w, ext_patch_h, ext_patch_w, pad_h, pad_w, stride_h, stride_w, kstride_h, kstride_w, height_col, width_col, WrapHandle(data_im, ctx), data_offset), ctx->get_queue()); }
// Reorders the weight blob into the swizzled layout expected by the
// spatial convolution kernels, via the "copyWeightsSwizzled" OpenCL kernel.
void ConvolutionLayerSpatial<Dtype>::swizzleWeights(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
    int_tp swizzled_factor) {
  viennacl::ocl::context &ctx =
      viennacl::ocl::get_context(this->device_->id());
  viennacl::ocl::program &program = this->device_->program();
  viennacl::ocl::kernel &oclk_copy_weight =
      program.get_kernel(CL_KERNEL_SELECT("copyWeightsSwizzled"));

  // Per-group channel count.
  const int_tp channels = this->channels_ / this->group_;

  cl_uint arg = 0;
  oclk_copy_weight.arg(arg++, WrapHandle((cl_mem) weight, &ctx));
  oclk_copy_weight.arg(arg++, WrapHandle((cl_mem) swizzled_weights, &ctx));
  oclk_copy_weight.arg(arg++, kernel_w_);
  oclk_copy_weight.arg(arg++, kernel_h_);
  oclk_copy_weight.arg(arg++, channels);
  oclk_copy_weight.arg(arg++, this->num_output_);
  oclk_copy_weight.arg(arg++, swizzled_factor);

  // One work item per weight element; local size left to the runtime.
  const size_t copy_global[3] = {
      (size_t) (this->num_output_ * channels * kernel_w_ * kernel_h_), 1, 1 };
  OCL_CHECK(clEnqueueNDRangeKernel(ctx.get_queue().handle().get(),
                                   oclk_copy_weight.handle().get(), 3, NULL,
                                   copy_global, NULL, 0, NULL, NULL));
}
void greentea_col2im_gpu(viennacl::ocl::program *prog, viennacl::ocl::context *ctx, const cl_mem data_col, const int_tp data_col_off, const int_tp channels, const int_tp height, const int_tp width, const int_tp patch_h, const int_tp patch_w, const int_tp pad_h, const int_tp pad_w, const int_tp stride_h, const int_tp stride_w, cl_mem data_im, const int_tp data_im_off) { int_tp height_col = (height + 2 * pad_h - patch_h) / stride_h + 1; int_tp width_col = (width + 2 * pad_w - patch_w) / stride_w + 1; int_tp num_kernels = channels * height * width; viennacl::ocl::kernel &kernel = prog->get_kernel(CL_KERNEL_SELECT("col2im")); viennacl::ocl::enqueue( kernel(num_kernels, WrapHandle(data_col, ctx), data_col_off, height, width, channels, patch_h, patch_w, pad_h, pad_w, stride_h, stride_w, height_col, width_col, WrapHandle(data_im, ctx), data_im_off), ctx->get_queue()); }
void greentea_im2col_gpu(viennacl::ocl::program *prog, viennacl::ocl::context *ctx, const cl_mem data_im, const int_tp data_im_off, const int_tp channels, const int_tp height, const int_tp width, const int_tp kernel_h, const int_tp kernel_w, const int_tp pad_h, const int_tp pad_w, const int_tp stride_h, const int_tp stride_w, cl_mem data_col, const int_tp data_col_off) { // We are going to launch channels * height_col * width_col kernels, each // kernel responsible for copying a single-channel grid. int_tp height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; int_tp width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; int_tp num_kernels = channels * height_col * width_col; viennacl::ocl::kernel &kernel = prog->get_kernel(CL_KERNEL_SELECT("im2col")); viennacl::ocl::enqueue( kernel(num_kernels, WrapHandle(data_im, ctx), data_im_off, height, width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, height_col, width_col, WrapHandle(data_col, ctx), data_col_off), ctx->get_queue()); }
// Binds `buffer` (optionally a sub-region of it) as kernel argument argIdx.
//
// offset == 0 binds the whole buffer directly. Otherwise a cl sub-buffer
// covering [offset, offset + size) elements is created; if `preserved` is
// set, the sub-buffer is cached in subBufferMap keyed by (buffer, offset,
// size) and reused on later calls, otherwise it is tracked in
// tmpSubBuffers for cleanup after the enqueue (see cleanTmpSubBuffers).
// `readOnly` selects CL_MEM_READ_ONLY vs CL_MEM_READ_WRITE for the region.
void ConvolutionLayerSpatial<Dtype>::setBufferKernelArg(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
    viennacl::ocl::kernel *kernel, const cl_uint &argIdx,
    viennacl::ocl::context *ctx, cl_mem buffer, size_t offset, size_t size,
    bool readOnly, bool preserved) {
  // Whole-buffer case: no sub-buffer needed.
  if (offset == 0) {
    kernel->arg(argIdx, WrapHandle((cl_mem) buffer, ctx));
    return;
  }

  // Reuse a cached sub-buffer for this exact region when available.
  // (Single lookup instead of the previous find-twice pattern.)
  if (preserved) {
    auto cached = subBufferMap.find(std::make_tuple(buffer, offset, size));
    if (cached != subBufferMap.end()) {
      kernel->arg(argIdx, WrapHandle(cached->second, ctx));
      return;
    }
  }

  // Offsets/sizes are in elements; OpenCL wants bytes.
  cl_buffer_region region;
  region.origin = offset * sizeof(Dtype);
  region.size = size * sizeof(Dtype);
  cl_mem_flags memFlags = readOnly ? CL_MEM_READ_ONLY : CL_MEM_READ_WRITE;
  cl_int error;
  // FIX: pass &region — the argument had been corrupted to "®ion" (a
  // mis-encoded "&region"), which is not valid C++.
  cl_mem sub_buffer = clCreateSubBuffer(buffer, memFlags,
                                        CL_BUFFER_CREATE_TYPE_REGION,
                                        &region, &error);
  CHECK_EQ(error, CL_SUCCESS) << "Failed to create sub buffer." << std::endl;
  kernel->arg(argIdx, WrapHandle(sub_buffer, ctx));
  if (preserved)
    subBufferMap.insert(std::make_pair(std::make_tuple(buffer, offset, size),
                                       sub_buffer));
  else
    tmpSubBuffers.push_back(sub_buffer);
}
// Copies the (possibly offset) input image into col_data with explicit
// zero padding applied, using the "copyImage" OpenCL kernel, so that the
// convolution kernels can assume a pre-padded input.
void ConvolutionLayerSpatial<Dtype>::pad_image(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top,
    int_tp image_offset, kernelConfig* config, int_tp imgNum) {
#ifdef USE_GREENTEA
  viennacl::ocl::context &ctx = viennacl::ocl::get_context(
      this->device_->id());

  // Copy kernel
  viennacl::ocl::program &program = this->device_->program();
  viennacl::ocl::kernel &oclk_copy = program.get_kernel(
      CL_KERNEL_SELECT("copyImage"));

  cl_uint argIdx = 0;
  int_tp col_data_offset = 0;
  int_tp channels = this->channels_;

  oclk_copy.arg(argIdx++, WrapHandle((cl_mem) bottom_data, &ctx));
  oclk_copy.arg(argIdx++, image_offset);
  oclk_copy.arg(argIdx++, channels);
  oclk_copy.arg(argIdx++, height_);
  oclk_copy.arg(argIdx++, width_);
  oclk_copy.arg(argIdx++, padded_height_);
  oclk_copy.arg(argIdx++, padded_width_);
  oclk_copy.arg(argIdx++, pad_h_);
  oclk_copy.arg(argIdx++, pad_w_);
  oclk_copy.arg(argIdx++, WrapHandle((cl_mem) col_data, &ctx));
  oclk_copy.arg(argIdx++, col_data_offset);
  oclk_copy.arg(argIdx++, imgNum);

  // One work item per padded output pixel per channel.
  const size_t global_work_size_Copy[3] = { (size_t) padded_width_,
      (size_t) padded_height_, (size_t) channels };

  // FIX: the enqueue status was previously discarded; check it with
  // OCL_CHECK for consistency with swizzleWeights() so failures are not
  // silently ignored.
  OCL_CHECK(clEnqueueNDRangeKernel(ctx.get_queue().handle().get(),
                                   oclk_copy.handle().get(), 3, NULL,
                                   global_work_size_Copy, NULL, 0, NULL,
                                   NULL));
#endif
}
void greentea_col2im_ndsk_gpu(viennacl::ocl::program *prog, viennacl::ocl::context *ctx, cl_mem data_col, const int_tp data_col_off, const int_tp num_spatial_axes, const int_tp im_size, cl_mem im_shape, cl_mem col_shape, cl_mem kernel_shape, cl_mem pad, cl_mem stride, cl_mem kstride, cl_mem data_im, int_tp data_off) { viennacl::ocl::kernel &kernel = prog->get_kernel( CL_KERNEL_SELECT("col2im_ndsk")); viennacl::ocl::enqueue( kernel(im_size, num_spatial_axes, WrapHandle(data_col, ctx), data_col_off, WrapHandle(im_shape, ctx), WrapHandle(col_shape, ctx), WrapHandle(kernel_shape, ctx), WrapHandle(pad, ctx), WrapHandle(stride, ctx), WrapHandle(kstride, ctx), WrapHandle(data_im, ctx), data_off), ctx->get_queue()); }
// Runs the selected spatial convolution kernel (config) over numImages
// inputs, per group. Two paths:
//   * kernelType != 2: plain kernels — arguments are bound with raw
//     offsets, looping over images and groups, one enqueue each.
//   * kernelType == 2: swizzled-weight kernels — weights are reordered
//     first, buffers are bound via sub-buffers (setBufferKernelArg), and
//     the whole image batch is processed per enqueue.
// Returns the first failing clEnqueueNDRangeKernel status, or the last
// enqueue's status on success.
cl_int ConvolutionLayerSpatial<float>::convolve(
    const vector<Blob<float>*>& bottom, const vector<Blob<float>*>& top,
    int_tp index, int_tp numImages, kernelConfig* config) {
  viennacl::ocl::context &ctx =
      viennacl::ocl::get_context(this->device_->id());
  viennacl::ocl::program &program = ctx.get_program(config->kernelName);
  viennacl::ocl::kernel &kernel = program.get_kernel(config->kernelName);
  cl_int err = 0;
  if (config->kernelType != 2) {
    for (int_tp n = 0; n < numImages; ++n) {
      for (int_tp g = 0; g < group_; ++g) {
        bias_offset_ = M_ * g;
        // Per-image, per-group offsets into the flat bottom/top buffers.
        int_tp image_offset = n * this->bottom_dim_
            + width_ * height_ * (channels_ / group_) * g;
        int_tp output_image_offset = n * this->top_dim_
            + output_w_ * output_h_ * M_ * g;
        cl_uint argIdx = 0;
        int_tp kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_)
            * M_ * g;
        // Copy image: when padding is needed, pad_image() writes the
        // padded input into col_data and the kernel reads from offset 0.
        if (pad_w_ > 0 || pad_h_ > 0) {
          pad_image(bottom, top, image_offset, config, numImages);
          image_offset = 0;
          kernel.arg(argIdx++, WrapHandle((cl_mem) col_data, &ctx));
        } else {
          kernel.arg(argIdx++, WrapHandle((cl_mem) bottom_data, &ctx));
        }
        kernel.arg(argIdx++, image_offset);
        kernel.arg(argIdx++, WrapHandle((cl_mem) weight, &ctx));
        kernel.arg(argIdx++, kernel_offset);
        kernel.arg(argIdx++, WrapHandle((cl_mem) bias_, &ctx));
        kernel.arg(argIdx++, bias_offset_);
        kernel.arg(argIdx++, WrapHandle((cl_mem) top_data, &ctx));
        kernel.arg(argIdx++, output_image_offset);
        // Dimensions are passed as 16-bit values; the generated kernels
        // expect ushort arguments here.
        kernel.arg(argIdx++, (uint16_t)padded_width_);
        kernel.arg(argIdx++, (uint16_t)padded_height_);
        kernel.arg(argIdx++, (uint16_t)output_w_);
        kernel.arg(argIdx++, (uint16_t)output_h_);
        // Some configs let the runtime pick the local work size.
        if (config->use_null_local) {
          err = clEnqueueNDRangeKernel(ctx.get_queue().handle().get(),
                                       kernel.handle().get(), 3, NULL,
                                       config->global_work_size, NULL, 0, NULL,
                                       NULL);
        } else {
          err = clEnqueueNDRangeKernel(ctx.get_queue().handle().get(),
                                       kernel.handle().get(), 3, NULL,
                                       config->global_work_size,
                                       config->local_work_size, 0, NULL,
                                       NULL);
        }
        if (err != CL_SUCCESS)
          return err;
        viennacl::backend::finish();
      }
    }
  } else {
    // Swizzled-weight path: reorder weights (factor 16), then bind each
    // buffer region via sub-buffers and process the whole batch per group.
    swizzleWeights(bottom, top, 16);
    size_t total_bottom_size = bottom_dim_ * numImages;
    size_t total_kernel_size = kernel_h_ * kernel_w_ * channels_ * M_;
    size_t total_bias_size = M_ * group_;
    size_t total_top_size = top_dim_ * numImages;
    for (int_tp g = 0; g < group_; ++g) {
      bias_offset_ = M_ * g;
      int_tp image_offset = width_ * height_ * (channels_ / group_) * g;
      int_tp output_image_offset = output_w_ * output_h_ * M_ * g;
      cl_uint argIdx = 0;
      int_tp kernel_offset = kernel_h_ * kernel_w_ * (channels_ / group_)
          * M_ * g;
      // Copy image: padded input lands in col_data at offset 0.
      cl_mem input_image;
      if (pad_w_ > 0 || pad_h_ > 0) {
        pad_image(bottom, top, image_offset, config, numImages);
        image_offset = 0;
        input_image = (cl_mem) col_data;
      } else {
        input_image = (cl_mem) bottom_data;
      }
      // Each buffer is bound as a sub-region starting at its offset;
      // weights/bias regions are preserved (cached), input/output are
      // temporary and cleaned up below.
      setBufferKernelArg(bottom, top, &kernel, argIdx++, &ctx, input_image,
                         image_offset, total_bottom_size - image_offset,
                         true, false);
      setBufferKernelArg(bottom, top, &kernel, argIdx++, &ctx,
                         (cl_mem) swizzled_weights, kernel_offset,
                         total_kernel_size - kernel_offset, true, true);
      setBufferKernelArg(bottom, top, &kernel, argIdx++, &ctx, (cl_mem) bias_,
                         bias_offset_, total_bias_size - bias_offset_, true,
                         true);
      setBufferKernelArg(bottom, top, &kernel, argIdx++, &ctx,
                         (cl_mem) top_data, output_image_offset,
                         total_top_size - output_image_offset, false, false);
      kernel.arg(argIdx++, (uint16_t)padded_width_);
      kernel.arg(argIdx++, (uint16_t)padded_height_);
      kernel.arg(argIdx++, (uint16_t)output_w_);
      kernel.arg(argIdx++, (uint16_t)output_h_);
      err = clEnqueueNDRangeKernel(ctx.get_queue().handle().get(),
                                   kernel.handle().get(), 3, NULL,
                                   config->global_work_size,
                                   config->local_work_size, 0, NULL, NULL);
      if (err != CL_SUCCESS)
        return err;
      viennacl::backend::finish();
    }
    // Release the temporary (non-preserved) sub-buffers created above.
    if (group_ > 1) {
      viennacl::backend::finish();
      cleanTmpSubBuffers(bottom, top);
    }
  }
  return err;
}