inline kernel_call transpose_kernel(
        const backend::command_queue &queue, size_t width, size_t height,
        const backend::device_vector<T2> &in,
        const backend::device_vector<T2> &out
        )
{
    backend::source_generator o;
    kernel_common<T>(o, queue);

    // Determine the largest block size that fits both into local memory and
    // into the maximum work group size. T2 is a two-component vector of T,
    // hence the factor sizeof(T) * 2 per element.
    size_t block_size = 128;
    {
#ifndef VEXCL_BACKEND_CUDA
        cl_device_id dev = backend::get_device_id(queue);
        cl_ulong local_size;
        size_t workgroup;
        clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE,
                sizeof(cl_ulong), &local_size, NULL);
        clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                sizeof(size_t), &workgroup, NULL);
#else
        const auto local_size = queue.device().max_shared_memory_per_block();
        const auto workgroup  = queue.device().max_threads_per_block();
#endif
        while(block_size * block_size * sizeof(T) * 2 > local_size) block_size /= 2;
        while(block_size * block_size > workgroup) block_size /= 2;
    }

    // Tiled transpose through local memory, from the NVIDIA SDK.
    o.kernel("transpose").open("(")
        .template parameter< global_ptr<const T2> >("input")
        .template parameter< global_ptr<      T2> >("output")
        .template parameter< cl_uint              >("width")
        .template parameter< cl_uint              >("height")
        .close(")").open("{");

    o.new_line() << "const size_t global_x = " << o.global_id(0) << ";";
    o.new_line() << "const size_t global_y = " << o.global_id(1) << ";";
    o.new_line() << "const size_t local_x = "  << o.local_id(0)  << ";";
    o.new_line() << "const size_t local_y = "  << o.local_id(1)  << ";";
    o.new_line() << "const size_t group_x = "  << o.group_id(0)  << ";";
    o.new_line() << "const size_t group_y = "  << o.group_id(1)  << ";";
    o.new_line() << "const size_t target_x = local_y + group_y * " << block_size << ";";
    o.new_line() << "const size_t target_y = local_x + group_x * " << block_size << ";";
    o.new_line() << "const bool range = global_x < width && global_y < height;";

    // Local memory tile holding one block.
    {
        std::ostringstream s;
        s << "block[" << block_size * block_size << "]";
        o.smem_static_var(type_name<T2>(), s.str());
    }

    // Copy from input to local memory.
    o.new_line() << "if(range) "
        << "block[local_x + local_y * " << block_size << "] = "
        << "input[global_x + global_y * width];";

    // Wait until the whole block is filled.
    o.new_line().barrier();

    // Write the local block to its transposed target position.
    o.new_line() << "if(range) "
        << "output[target_x + target_y * height] = "
        << "block[local_x + local_y * " << block_size << "];";

    o.close("}");

    backend::kernel kernel(queue, o.str(), "transpose");

    kernel.push_arg(in);
    kernel.push_arg(out);
    kernel.push_arg(static_cast<cl_uint>(width));
    kernel.push_arg(static_cast<cl_uint>(height));

    // Round the range up to a multiple of the work group size; the last
    // block may be only partially filled, hence the range guard above.
    size_t r_w = (width  + block_size - 1) / block_size;
    size_t r_h = (height + block_size - 1) / block_size;

    kernel.config(backend::ndrange(r_w, r_h),
            backend::ndrange(block_size, block_size));

    std::ostringstream desc;
    desc << "transpose{"
         << "w=" << width  << "(" << r_w << "), "
         << "h=" << height << "(" << r_h << "), "
         << "bs=" << block_size << "}";

    return kernel_call(false, desc.str(), kernel);
}
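/*
 * For reference, a minimal sketch of the source the generator emits above,
 * assuming T = float, block_size = 32, and the OpenCL backend (all
 * assumptions for illustration; the real block size is computed at runtime
 * and kernel_common() may add further preamble):
 *
 *   kernel void transpose(
 *       global const float2 *input, global float2 *output,
 *       uint width, uint height)
 *   {
 *       const size_t global_x = get_global_id(0);
 *       const size_t global_y = get_global_id(1);
 *       const size_t local_x = get_local_id(0);
 *       const size_t local_y = get_local_id(1);
 *       const size_t group_x = get_group_id(0);
 *       const size_t group_y = get_group_id(1);
 *       const size_t target_x = local_y + group_y * 32;
 *       const size_t target_y = local_x + group_x * 32;
 *       const bool range = global_x < width && global_y < height;
 *       local float2 block[1024];
 *       if(range) block[local_x + local_y * 32] = input[global_x + global_y * width];
 *       barrier(CLK_LOCAL_MEM_FENCE);
 *       if(range) output[target_x + target_y * height] = block[local_x + local_y * 32];
 *   }
 *
 * Each work group stages a 32x32 tile in local memory so that both the read
 * from input and the write to output are coalesced.
 */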
size_t preferred_work_group_size_multiple(const backend::command_queue &q) const {
    return q.device().warp_size();
}
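/*
 * Hedged usage sketch. The kernel_call member names (desc, kernel) and the
 * operator() enqueue syntax are assumptions for illustration; within the FFT
 * plan the returned calls are collected into a list and enqueued in order:
 *
 *   auto call = transpose_kernel(queue, w, h, in_buf, out_buf);
 *   std::cout << call.desc << std::endl; // e.g. "transpose{w=..., h=..., bs=...}"
 *   call.kernel(queue);                  // enqueue on the command queue
 */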