/// Builds the Bluestein padding kernel: the conjugated length-n sequence is
/// mirrored into both ends of a length-m output buffer and the middle is
/// zero-filled. Returns a one-shot kernel_call with exactly m work items.
inline kernel_call bluestein_pad_kernel(const cl::CommandQueue &queue,
        size_t n, size_t m, const cl::Buffer &in, const cl::Buffer &out)
{
    std::ostringstream src;
    kernel_common<T>(src, qdev(queue));

    // Complex-conjugate helper for the packed real2_t type.
    src << "real2_t conj(real2_t v) {\n"
           " return (real2_t)(v.x, -v.y);\n"
           "}\n";

    // One work item per output element: copy the conjugated input near both
    // ends of the buffer, write zeroes everywhere else.
    src << "__kernel void bluestein_pad_kernel("
           "__global real2_t *input, __global real2_t *output, uint n, uint m) {\n"
           " const size_t x = get_global_id(0);\n"
           " if(x < n || m - x < n)\n"
           " output[x] = conj(input[min(x, m - x)]);\n"
           " else\n"
           " output[x] = (real2_t)(0,0);\n"
           "}\n";

    auto prg = build_sources(qctx(queue), src.str());
    cl::Kernel krn(prg, "bluestein_pad_kernel");

    krn.setArg(0, in);
    krn.setArg(1, out);
    krn.setArg(2, static_cast<cl_uint>(n));
    krn.setArg(3, static_cast<cl_uint>(m));

    std::ostringstream label;
    label << "bluestein_pad_kernel{n=" << n << ", m=" << m << "}";

    // Run-once kernel; exact global size m, implementation-chosen local size.
    return kernel_call(true, label.str(), prg, krn,
            cl::NDRange(m), cl::NullRange);
}
// Builds a kernel that multiplies `data` element-wise by the Bluestein
// exponent table `exp`: one row of n complex values per batch item, with
// exp[x] shared across all batches. The kernel parameter is named `stride`
// but is set from n below; the launch is 2D (n padded to a wg multiple,
// batch), with an in-kernel bounds check on x.
inline kernel_call bluestein_mul(const cl::CommandQueue &queue, size_t n, size_t batch,
        const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out)
{
    std::ostringstream o;
    kernel_common<T>(o, qdev(queue));
    mul_code(o, false);  // emits complex mul() helper, no conjugation

    o << "__kernel void bluestein_mul("
      << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, uint stride) {\n"
      << " const size_t x = get_global_id(0), y = get_global_id(1);\n"
      << " if(x < stride) {\n"
      // NOTE(review): this emitted line lacks a trailing \n, so the next
      // statement follows on the same generated line — harmless in OpenCL C.
      << " const size_t off = x + stride * y;"
      << " output[off] = mul(data[off], exp[x]);\n"
      << " }\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "bluestein_mul");
    kernel.setArg(0, data);
    kernel.setArg(1, exp);
    kernel.setArg(2, out);
    kernel.setArg(3, static_cast<cl_uint>(n));

    // Pad the x-range to the kernel's preferred work-group size multiple.
    const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue));
    const size_t threads = alignup(n, wg);

    std::ostringstream desc;
    desc << "bluestein_mul{n=" << n << "(" << threads << "), wg=" << wg << ", batch=" << batch << "}";
    return kernel_call(false, desc.str(), program, kernel,
            cl::NDRange(threads, batch), cl::NDRange(wg, 1));
}
// Builds one radix stage of the Cooley-Tukey FFT decomposition: the kernel
// applies a size-`radix` butterfly to m = n/radix work items per batch.
// `p` is the current sub-transform size, `invert` selects direction, and
// `once` marks a stage that should only be enqueued a single time.
inline kernel_call radix_kernel(bool once, const cl::CommandQueue &queue, size_t n, size_t batch,
        bool invert, pow radix, size_t p, const cl::Buffer &in, const cl::Buffer &out)
{
    std::ostringstream o;
    // Full precision so twiddle constants embedded in the source round-trip.
    o << std::setprecision(25);
    const auto device = qdev(queue);
    kernel_common<T>(o, device);
    mul_code(o, invert);
    twiddle_code<T>(o);

    const size_t m = n / radix.value;  // butterflies (work items) per batch
    kernel_radix<T>(o, radix, invert);

    auto program = build_sources(qctx(queue), o.str(),
            "-cl-mad-enable -cl-fast-relaxed-math");
    cl::Kernel kernel(program, "radix");
    kernel.setArg(0, in);
    kernel.setArg(1, out);
    kernel.setArg(2, static_cast<cl_uint>(p));
    kernel.setArg(3, static_cast<cl_uint>(m));

    // Work-group size: just the preferred multiple; the commented-out code
    // below is an abandoned attempt at scaling wg with compute-unit count.
    const size_t wg_mul = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(device);
    //const size_t max_cu = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
    //const size_t max_wg = device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
    size_t wg = wg_mul;
    //while(wg * max_cu < max_wg) wg += wg_mul;
    //wg -= wg_mul;
    const size_t threads = alignup(m, wg);  // pad m to a wg multiple

    std::ostringstream desc;
    desc << "dft{r=" << radix << ", p=" << p << ", n=" << n << ", batch=" << batch
         << ", threads=" << m << "(" << threads << "), wg=" << wg << "}";
    return kernel_call(once, desc.str(), program, kernel,
            cl::NDRange(threads, batch), cl::NDRange(wg, 1));
}
// Builds a tiled complex-matrix transpose kernel (local-memory staging
// pattern, adapted from the NVIDIA SDK sample, as noted below). Each
// square block is read row-wise into local memory and written transposed.
inline kernel_call transpose_kernel(const cl::CommandQueue &queue,
        size_t width, size_t height, const cl::Buffer &in, const cl::Buffer &out)
{
    std::ostringstream o;
    const auto dev = qdev(queue);
    kernel_common<T>(o, dev);

    // determine max block size to fit into local memory/workgroup
    size_t block_size = 128;
    {
        const auto local_size = dev.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
        const auto workgroup = dev.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
        // sizeof(T) * 2 = one complex (real2_t) element per tile slot.
        while(block_size * block_size * sizeof(T) * 2 > local_size) block_size /= 2;
        while(block_size * block_size > workgroup) block_size /= 2;
    }

    // from NVIDIA SDK.
    o << "__kernel void transpose("
      << "__global const real2_t *input, __global real2_t *output, uint width, uint height) {\n"
      << " const size_t "
      << " global_x = get_global_id(0), global_y = get_global_id(1),\n"
      << " local_x = get_local_id(0), local_y = get_local_id(1),\n"
      << " group_x = get_group_id(0), group_y = get_group_id(1),\n"
      << " block_size = " << block_size << ",\n"
      // target coordinates swap local/group roles to realize the transpose
      << " target_x = local_y + group_y * block_size,\n"
      << " target_y = local_x + group_x * block_size;\n"
      << " const bool range = global_x < width && global_y < height;\n"
      // local memory
      << " __local real2_t block[" << (block_size * block_size) << "];\n"
      // copy from input to local memory
      << " if(range)\n"
      << " block[local_x + local_y * block_size] = input[global_x + global_y * width];\n"
      // wait until the whole block is filled
      << " barrier(CLK_LOCAL_MEM_FENCE);\n"
      // transpose local block to target
      << " if(range)\n"
      << " output[target_x + target_y * height] = block[local_x + local_y * block_size];\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "transpose");
    kernel.setArg(0, in);
    kernel.setArg(1, out);
    kernel.setArg(2, static_cast<cl_uint>(width));
    kernel.setArg(3, static_cast<cl_uint>(height));

    // range multiple of wg size, last block maybe not completely filled.
    size_t r_w = alignup(width, block_size);
    size_t r_h = alignup(height, block_size);

    std::ostringstream desc;
    desc << "transpose{"
         << "w=" << width << "(" << r_w << "), "
         << "h=" << height << "(" << r_h << "), "
         << "bs=" << block_size << "}";
    return kernel_call(false, desc.str(), program, kernel,
            cl::NDRange(r_w, r_h), cl::NDRange(block_size, block_size));
}
// Backend-generator version of the Bluestein padding kernel: mirrors the
// conjugated length-n sequence into both ends of a length-m buffer and
// zero-fills the middle. Unlike the raw-CL variant, the global size is
// rounded up to the work-group multiple, so the kernel guards with x < m.
inline kernel_call bluestein_pad_kernel(
        const backend::command_queue &queue, size_t n, size_t m,
        const backend::device_vector<T2> &in, const backend::device_vector<T2> &out
        )
{
    scoped_program_header header(queue, fft_kernel_header<T>());
    backend::source_generator o(queue);
    o << std::setprecision(25);

    // Helper: complex conjugate of a T2 value.
    o.begin_function<T2>("conj");
    o.begin_function_parameters();
    o.template parameter<T2>("v");
    o.end_function_parameters();
    o.new_line() << type_name<T2>() << " r = {v.x, -v.y};";
    o.new_line() << "return r;";
    o.end_function();

    o.begin_kernel("bluestein_pad_kernel");
    o.begin_kernel_parameters();
    o.template parameter< global_ptr<const T2> >("input");
    o.template parameter< global_ptr< T2> >("output");
    o.template parameter< cl_uint >("n");
    o.template parameter< cl_uint >("m");
    o.end_kernel_parameters();
    o.new_line() << "const uint x = " << o.global_id(0) << ";";
    o.new_line() << "if (x < m)";  // bounds check for the padded launch range
    o.open("{");
    o.new_line() << "if(x < n || m - x < n)";
    o.open("{");
    o.new_line() << "output[x] = conj(input[min(x, m - x)]);";
    o.close("}");
    o.new_line() << "else";
    o.open("{");
    o.new_line() << type_name<T2>() << " r = {0,0};";
    o.new_line() << "output[x] = r;";
    o.close("}");
    o.close("}");
    o.end_kernel();

    backend::kernel kernel(queue, o.str(), "bluestein_pad_kernel");
    kernel.push_arg(in);
    kernel.push_arg(out);
    kernel.push_arg(static_cast<cl_uint>(n));
    kernel.push_arg(static_cast<cl_uint>(m));

    // Round m up to the preferred work-group size multiple.
    size_t ws = kernel.preferred_work_group_size_multiple(queue);
    size_t gs = (m + ws - 1) / ws;
    kernel.config(gs, ws);

    std::ostringstream desc;
    desc << "bluestein_pad_kernel{n=" << n << ", m=" << m << "}";
    return kernel_call(true, desc.str(), kernel);
}
// Backend-generator version of the element-wise Bluestein multiply:
// output[x + stride*y] = data[x + stride*y] * exp[x], for y over batches.
// As in the raw-CL variant, the `stride` argument is set from n.
inline kernel_call bluestein_mul(
        const backend::command_queue &queue, size_t n, size_t batch,
        const backend::device_vector<T2> &data,
        const backend::device_vector<T2> &exp,
        const backend::device_vector<T2> &out
        )
{
    scoped_program_header header(queue, fft_kernel_header<T>());
    backend::source_generator o(queue);
    o << std::setprecision(25);
    mul_code<T2>(o, false);  // complex mul() helper, no conjugation

    o.begin_kernel("bluestein_mul");
    o.begin_kernel_parameters();
    o.template parameter< global_ptr<const T2> >("data");
    o.template parameter< global_ptr<const T2> >("exp");
    o.template parameter< global_ptr< T2> >("output");
    o.template parameter< cl_uint >("stride");
    o.end_kernel_parameters();
    o.new_line() << "const size_t x = " << o.global_id(0) << ";";
    o.new_line() << "const size_t y = " << o.global_id(1) << ";";
    o.new_line() << "if(x < stride)";  // guard: x-range is padded below
    o.open("{");
    o.new_line() << "const size_t off = x + stride * y;";
    o.new_line() << "output[off] = mul(data[off], exp[x]);";
    o.close("}");
    o.end_kernel();

    backend::kernel kernel(queue, o.str(), "bluestein_mul");
    kernel.push_arg(data);
    kernel.push_arg(exp);
    kernel.push_arg(out);
    kernel.push_arg(static_cast<cl_uint>(n));

    // NOTE(review): `threads` here is the number of work-GROUPS (n/wg rounded
    // up), paired with local size wg — presumably backend::ndrange takes
    // group counts; verify against backend::kernel::config semantics.
    const size_t wg = kernel.preferred_work_group_size_multiple(queue);
    const size_t threads = (n + wg - 1) / wg;
    kernel.config(backend::ndrange(threads, batch), backend::ndrange(wg, 1));

    std::ostringstream desc;
    desc << "bluestein_mul{n=" << n << "(" << threads << "), wg=" << wg << ", batch=" << batch << "}";
    return kernel_call(false, desc.str(), kernel);
}
// Builds the Bluestein "input" multiply kernel: scatters each radix-sized
// chunk of the input into a zero-padded row of length `stride`, multiplying
// by the Bluestein exponent table and (when p != 1) by an extra Cooley-Tukey
// twiddle factor. 3D launch: (threads, batch, stride padded to wg multiple).
inline kernel_call bluestein_mul_in(const cl::CommandQueue &queue, bool inverse, size_t batch,
        size_t radix, size_t p, size_t threads, size_t stride,
        const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out)
{
    std::ostringstream o;
    kernel_common<T>(o, qdev(queue));
    mul_code(o, false);
    twiddle_code<T>(o);

    o << "__kernel void bluestein_mul_in("
      << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, "
      << "uint radix, uint p, uint out_stride) {\n"
      << " const size_t\n"
      << " thread = get_global_id(0), threads = get_global_size(0),\n"
      << " batch = get_global_id(1),\n"
      << " element = get_global_id(2);\n"
      << " if(element < out_stride) {\n"
      << " const size_t\n"
      << " in_off = thread + batch * radix * threads + element * threads,\n"
      << " out_off = thread * out_stride + batch * out_stride * threads + element;\n"
      << " if(element < radix) {\n"
      << " real2_t w = exp[element];"
      // extra twiddle for intermediate stages; sign is baked in at build time
      << " if(p != 1) {\n"
      << " const int sign = " << (inverse ? "+1" : "-1") << ";\n"
      << " ulong a = (ulong)element * (thread % p);\n"
      << " ulong b = (ulong)radix * p;\n"
      // reduce a mod 2b before the float conversion to preserve precision
      << " real2_t t = twiddle(2 * sign * M_PI * (a % (2 * b)) / b);\n"
      << " w = mul(w, t);\n"
      << " }\n"
      << " output[out_off] = mul(data[in_off], w);\n"
      << " } else\n"
      << " output[out_off] = (real2_t)(0,0);"  // zero-pad the row tail
      << " }\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "bluestein_mul_in");
    kernel.setArg(0, data);
    kernel.setArg(1, exp);
    kernel.setArg(2, out);
    kernel.setArg(3, static_cast<cl_uint>(radix));
    kernel.setArg(4, static_cast<cl_uint>(p));
    kernel.setArg(5, static_cast<cl_uint>(stride));

    // Only the innermost (element) dimension is padded/work-grouped.
    const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue));
    const size_t stride_pad = alignup(stride, wg);

    std::ostringstream desc;
    desc << "bluestein_mul_in{batch=" << batch << ", radix=" << radix << ", p=" << p
         << ", threads=" << threads << ", stride=" << stride << "(" << stride_pad << "), wg=" << wg << "}";
    return kernel_call(false,
            desc.str(), program, kernel,
            cl::NDRange(threads, batch, stride_pad), cl::NDRange(1, 1, wg));
}
// Backend-generator version of the Cooley-Tukey radix stage (see the raw-CL
// variant above for the algorithmic role of n, p, radix and invert).
// Compiler flags are selected per backend: fast-math for OpenCL, CUDA's
// --use_fast_math otherwise.
inline kernel_call radix_kernel(
        bool once, const backend::command_queue &queue, size_t n, size_t batch,
        bool invert, pow radix, size_t p,
        const backend::device_vector<T2> &in, const backend::device_vector<T2> &out
        )
{
    backend::source_generator o;
    // Full precision so embedded twiddle constants round-trip exactly.
    o << std::setprecision(25);
    kernel_common<T>(o, queue);
    mul_code<T2>(o, invert);
    twiddle_code<T, T2>(o);

    const size_t m = n / radix.value;  // butterflies (work items) per batch
    kernel_radix<T, T2>(o, radix, invert);

    backend::kernel kernel(queue, o.str(), "radix", 0,
#ifndef VEXCL_BACKEND_CUDA
            "-cl-mad-enable -cl-fast-relaxed-math"
#else
            "--use_fast_math"
#endif
            );
    kernel.push_arg(in);
    kernel.push_arg(out);
    kernel.push_arg(static_cast<cl_uint>(p));
    kernel.push_arg(static_cast<cl_uint>(m));

    // Work-group size: just the preferred multiple; the commented-out code
    // is an abandoned compute-unit-based scaling heuristic.
    const size_t wg_mul = kernel.preferred_work_group_size_multiple(queue);
    //const size_t max_cu = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
    //const size_t max_wg = device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
    size_t wg = wg_mul;
    //while(wg * max_cu < max_wg) wg += wg_mul;
    //wg -= wg_mul;
    const size_t threads = (m + wg - 1) / wg;  // number of work-groups in x
    kernel.config(backend::ndrange(threads, batch), backend::ndrange(wg, 1));

    std::ostringstream desc;
    desc << "dft{r=" << radix << ", p=" << p << ", n=" << n << ", batch=" << batch
         << ", threads=" << m << "(" << threads << "), wg=" << wg << "}";
    return kernel_call(once, desc.str(), kernel);
}
int main() { int width = 320; int height = 112; int *a = (int*)malloc(width*height*sizeof(int)); int pitch = sizeof(int)*width; auto acc = hc::accelerator(); int* a_d = (int*)hc::am_alloc(width*height*sizeof(int), acc, 0); grid_launch_parm lp; grid_launch_init(&lp); lp.grid_dim = gl_dim3(width/TILE_I, height/TILE_J); lp.group_dim = gl_dim3(TILE_I, TILE_J); hc::completion_future cf; lp.cf = &cf; kernel_call(lp, a_d, pitch); lp.cf->wait(); static hc::accelerator_view av = acc.get_default_view(); av.copy(a_d, a, width*height*sizeof(int)); int ret = 0; for(int i = 0; i < width*height; ++i) { if(a[i] != i) ret++; } if(ret != 0) { printf("errors: %d\n", ret); return 1; } hc::am_free(a_d); free(a); return 0; }
/// Builds the kernel that fills `out` with the Bluestein chirp factors
/// w[x] = twiddle(sign * pi * x^2 / n); `inverse` selects the sign. The
/// kernel derives n from the global size, so it is launched with exactly
/// n work items and runs once per plan.
inline kernel_call bluestein_twiddle(const cl::CommandQueue &queue,
        size_t n, bool inverse, const cl::Buffer &out)
{
    std::ostringstream src;
    kernel_common<T>(src, qdev(queue));
    twiddle_code<T>(src);

    // Direction is baked into the generated source as a literal sign.
    const char *sign_lit = inverse ? "+1" : "-1";

    src << "__kernel void bluestein_twiddle(__global real2_t *output) {\n"
           " const size_t x = get_global_id(0), n = get_global_size(0);\n"
           " const int sign = " << sign_lit << ";\n"
           // x*x is reduced mod 2n in integer arithmetic before the
           // floating-point conversion to keep the angle accurate.
           " const size_t xx = ((ulong)x * x) % (2 * n);\n"
           " output[x] = twiddle(sign * M_PI * xx / n);\n"
           "}\n";

    auto prg = build_sources(qctx(queue), src.str());
    cl::Kernel krn(prg, "bluestein_twiddle");
    krn.setArg(0, out);

    std::ostringstream label;
    label << "bluestein_twiddle{n=" << n << ", inverse=" << inverse << "}";

    return kernel_call(true, label.str(), prg, krn,
            cl::NDRange(n), cl::NullRange);
}
// Backend-generator version of the Bluestein chirp-factor kernel. The
// signed pi constant is baked into the source at host precision instead of
// using an in-kernel sign variable; the launch range is padded, hence the
// x < n guard.
inline kernel_call bluestein_twiddle(
        const backend::command_queue &queue, size_t n, bool inverse,
        const backend::device_vector<T2> &out
        )
{
    scoped_program_header header(queue, fft_kernel_header<T>());
    backend::source_generator o(queue);
    o << std::setprecision(25);
    twiddle_code<T, T2>(o);

    o.begin_kernel("bluestein_twiddle");
    o.begin_kernel_parameters();
    o.template parameter< size_t >("n");
    o.template parameter< global_ptr<T2> >("output");
    o.end_kernel_parameters();
    o.new_line() << "const size_t x = " << o.global_id(0) << ";";
    // x*x reduced mod 2n in integer arithmetic before the float conversion.
    o.new_line() << "const size_t xx = ((ulong)x * x) % (2 * n);";
    o.new_line() << "if (x < n) output[x] = twiddle("
        << std::setprecision(16)
        << (inverse ? 1 : -1) * boost::math::constants::pi<T>()
        << " * xx / n);";
    o.end_kernel();

    backend::kernel kernel(queue, o.str(), "bluestein_twiddle");
    kernel.push_arg(n);
    kernel.push_arg(out);

    // Round n up to the preferred work-group size multiple.
    size_t ws = kernel.preferred_work_group_size_multiple(queue);
    size_t gs = (n + ws - 1) / ws;
    kernel.config(gs, ws);

    std::ostringstream desc;
    desc << "bluestein_twiddle{n=" << n << ", inverse=" << inverse << "}";
    return kernel_call(true, desc.str(), kernel);
}
// Builds the Bluestein "output" multiply kernel: gathers each padded row
// back into its digit-reversed position (k = i % p, j = k + (i-k)*radix),
// scaling by 1/stride (the inverse-FFT normalization) and multiplying by
// the exponent table. 3D launch: (threads, batch, radix padded to wg).
inline kernel_call bluestein_mul_out(const cl::CommandQueue &queue, size_t batch,
        size_t p, size_t radix, size_t threads, size_t stride,
        const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out)
{
    std::ostringstream o;
    kernel_common<T>(o, qdev(queue));
    mul_code(o, false);

    o << "__kernel void bluestein_mul_out("
      << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, "
      << "real_t div, uint p, uint in_stride, uint radix) {\n"
      << " const size_t\n"
      << " i = get_global_id(0), threads = get_global_size(0),\n"
      << " b = get_global_id(1),\n"
      << " l = get_global_id(2);\n"
      << " if(l < radix) {\n"
      << " const size_t\n"
      << " k = i % p,\n"
      << " j = k + (i - k) * radix,\n"
      << " in_off = i * in_stride + b * in_stride * threads + l,\n"
      << " out_off = j + b * threads * radix + l * p;\n"
      // `div` scales component-wise (real2_t * scalar), then complex-mul
      << " output[out_off] = mul(data[in_off] * div, exp[l]);\n"
      << " }\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "bluestein_mul_out");
    kernel.setArg(0, data);
    kernel.setArg(1, exp);
    kernel.setArg(2, out);
    kernel.setArg<T>(3, static_cast<T>(1) / stride);  // 1/stride normalization
    kernel.setArg(4, static_cast<cl_uint>(p));
    kernel.setArg(5, static_cast<cl_uint>(stride));
    kernel.setArg(6, static_cast<cl_uint>(radix));

    // Only the innermost (radix) dimension is padded/work-grouped.
    const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue));
    const size_t radix_pad = alignup(radix, wg);

    std::ostringstream desc;
    desc << "bluestein_mul_out{r=" << radix << "(" << radix_pad << "), wg=" << wg
         << ", batch=" << batch << ", p=" << p << ", thr=" << threads << ", stride=" << stride << "}";
    return kernel_call(false, desc.str(), program, kernel,
            cl::NDRange(threads, batch, radix_pad), cl::NDRange(1, 1, wg));
}
// Backend-generator version of the Bluestein "output" multiply (see the
// raw-CL variant above). A scale() helper is emitted because the backend's
// T2 type may not support operator*(T2, scalar) directly.
inline kernel_call bluestein_mul_out(
        const backend::command_queue &queue, size_t batch, size_t p,
        size_t radix, size_t threads, size_t stride,
        const backend::device_vector<T2> &data,
        const backend::device_vector<T2> &exp,
        const backend::device_vector<T2> &out
        )
{
    backend::source_generator o;
    kernel_common<T>(o, queue);
    mul_code<T2>(o, false);

    // scale(x, a): component-wise complex-by-scalar product.
    o.function<T2>("scale").open("(")
        .template parameter<T2>("x")
        .template parameter<T >("a")
        .close(")").open("{");
    o.new_line() << type_name<T2>() << " r = {x.x * a, x.y * a};";
    o.new_line() << "return r;";
    o.close("}");

    o.kernel("bluestein_mul_out").open("(")
        .template parameter< global_ptr<const T2> >("data")
        .template parameter< global_ptr<const T2> >("exp")
        .template parameter< global_ptr< T2> >("output")
        .template parameter< T >("div")
        .template parameter< cl_uint >("p")
        .template parameter< cl_uint >("in_stride")
        .template parameter< cl_uint >("radix")
        .close(")").open("{");
    o.new_line() << "const size_t i = " << o.global_id(0) << ";";
    o.new_line() << "const size_t threads = " << o.global_size(0) << ";";
    o.new_line() << "const size_t b = " << o.global_id(1) << ";";
    o.new_line() << "const size_t l = " << o.global_id(2) << ";";
    o.new_line() << "if(l < radix)";  // innermost dimension is padded
    o.open("{");
    // digit-reversal of the thread index: k = i % p, j = k + (i-k)*radix
    o.new_line() << "const size_t k = i % p;";
    o.new_line() << "const size_t j = k + (i - k) * radix;";
    o.new_line() << "const size_t in_off = i * in_stride + b * in_stride * threads + l;";
    o.new_line() << "const size_t out_off = j + b * threads * radix + l * p;";
    o.new_line() << "output[out_off] = mul(scale(data[in_off], div), exp[l]);";
    o.close("}");
    o.close("}");

    backend::kernel kernel(queue, o.str(), "bluestein_mul_out");
    kernel.push_arg(data);
    kernel.push_arg(exp);
    kernel.push_arg(out);
    kernel.push_arg(static_cast<T>(1.0 / stride));  // inverse-FFT scaling
    kernel.push_arg(static_cast<cl_uint>(p));
    kernel.push_arg(static_cast<cl_uint>(stride));
    kernel.push_arg(static_cast<cl_uint>(radix));

    // Only the innermost (radix) dimension gets a work-group size.
    const size_t wg = kernel.preferred_work_group_size_multiple(queue);
    const size_t radix_pad = (radix + wg - 1) / wg;
    kernel.config(
            backend::ndrange(threads, batch, radix_pad),
            backend::ndrange( 1, 1, wg)
            );

    std::ostringstream desc;
    desc << "bluestein_mul_out{r=" << radix << "(" << radix_pad << "), wg=" << wg
         << ", batch=" << batch << ", p=" << p << ", thr=" << threads << ", stride=" << stride << "}";
    return kernel_call(false, desc.str(), kernel);
}
// Backend-generator version of the Bluestein "input" multiply: scatters
// radix-sized chunks into zero-padded rows of length `stride`, applying the
// exponent table and (when p != 1) an extra Cooley-Tukey twiddle whose
// signed 2*pi constant is baked into the source at host precision.
inline kernel_call bluestein_mul_in(
        const backend::command_queue &queue, bool inverse, size_t batch,
        size_t radix, size_t p, size_t threads, size_t stride,
        const backend::device_vector<T2> &data,
        const backend::device_vector<T2> &exp,
        const backend::device_vector<T2> &out
        )
{
    backend::source_generator o;
    kernel_common<T>(o, queue);
    mul_code<T2>(o, false);
    twiddle_code<T, T2>(o);

    o.kernel("bluestein_mul_in").open("(")
        .template parameter< global_ptr<const T2> >("data")
        .template parameter< global_ptr<const T2> >("exp")
        .template parameter< global_ptr< T2> >("output")
        .template parameter< cl_uint >("radix")
        .template parameter< cl_uint >("p")
        .template parameter< cl_uint >("out_stride")
        .close(")").open("{");
    o.new_line() << "const size_t thread = " << o.global_id(0) << ";";
    o.new_line() << "const size_t threads = " << o.global_size(0) << ";";
    o.new_line() << "const size_t batch = " << o.global_id(1) << ";";
    o.new_line() << "const size_t element = " << o.global_id(2) << ";";
    o.new_line() << "if(element < out_stride)";  // innermost dim is padded
    o.open("{");
    o.new_line() << "const size_t in_off = thread + batch * radix * threads + element * threads;";
    o.new_line() << "const size_t out_off = thread * out_stride + batch * out_stride * threads + element;";
    o.new_line() << "if(element < radix)";
    o.open("{");
    o.new_line() << type_name<T2>() << " w = exp[element];";
    // Extra twiddle for intermediate stages only.
    o.new_line() << "if(p != 1)";
    o.open("{");
    // a is reduced mod 2b in integer arithmetic before the float division.
    o.new_line() << "ulong a = (ulong)element * (thread % p);";
    o.new_line() << "ulong b = (ulong)radix * p;";
    o.new_line() << type_name<T2>() << " t = twiddle("
        << std::setprecision(16)
        << (inverse ? 1 : -1) * boost::math::constants::two_pi<T>()
        << " * (a % (2 * b)) / b);";
    o.new_line() << "w = mul(w, t);";
    o.close("}");
    o.new_line() << "output[out_off] = mul(data[in_off], w);";
    o.close("}");
    o.new_line() << "else";
    o.open("{");
    // Zero-pad the tail of each output row.
    o.new_line() << type_name<T2>() << " r = {0,0};";
    o.new_line() << "output[out_off] = r;";
    o.close("}");
    o.close("}");
    o.close("}");

    backend::kernel kernel(queue, o.str(), "bluestein_mul_in");
    kernel.push_arg(data);
    kernel.push_arg(exp);
    kernel.push_arg(out);
    kernel.push_arg(static_cast<cl_uint>(radix));
    kernel.push_arg(static_cast<cl_uint>(p));
    kernel.push_arg(static_cast<cl_uint>(stride));

    // Only the innermost (element) dimension gets a work-group size.
    const size_t wg = kernel.preferred_work_group_size_multiple(queue);
    const size_t stride_pad = (stride + wg - 1) / wg;
    kernel.config(
            backend::ndrange(threads, batch, stride_pad),
            backend::ndrange( 1, 1, wg)
            );

    std::ostringstream desc;
    desc << "bluestein_mul_in{batch=" << batch << ", radix=" << radix << ", p=" << p
         << ", threads=" << threads << ", stride=" << stride << "(" << stride_pad << "), wg=" << wg << "}";
    return kernel_call(false, desc.str(), kernel);
}
// Backend-generator version of the tiled complex-matrix transpose (local
// memory staging, adapted from the NVIDIA SDK sample noted below). Device
// limits are queried per backend to size the square tile.
inline kernel_call transpose_kernel(
        const backend::command_queue &queue, size_t width, size_t height,
        const backend::device_vector<T2> &in, const backend::device_vector<T2> &out
        )
{
    backend::source_generator o;
    kernel_common<T>(o, queue);

    // determine max block size to fit into local memory/workgroup
    size_t block_size = 128;
    {
#ifndef VEXCL_BACKEND_CUDA
        // NOTE(review): clGetDeviceInfo return codes are ignored here;
        // local_size/workgroup stay uninitialized if the queries fail.
        cl_device_id dev = backend::get_device_id(queue);
        cl_ulong local_size;
        size_t workgroup;
        clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_size, NULL);
        clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &workgroup, NULL);
#else
        const auto local_size = queue.device().max_shared_memory_per_block();
        const auto workgroup = queue.device().max_threads_per_block();
#endif
        // sizeof(T) * 2 = one complex (T2) element per tile slot.
        while(block_size * block_size * sizeof(T) * 2 > local_size) block_size /= 2;
        while(block_size * block_size > workgroup) block_size /= 2;
    }

    // from NVIDIA SDK.
    o.kernel("transpose").open("(")
        .template parameter< global_ptr<const T2> >("input")
        .template parameter< global_ptr< T2> >("output")
        .template parameter< cl_uint >("width")
        .template parameter< cl_uint >("height")
        .close(")").open("{");
    o.new_line() << "const size_t global_x = " << o.global_id(0) << ";";
    o.new_line() << "const size_t global_y = " << o.global_id(1) << ";";
    o.new_line() << "const size_t local_x = " << o.local_id(0) << ";";
    o.new_line() << "const size_t local_y = " << o.local_id(1) << ";";
    o.new_line() << "const size_t group_x = " << o.group_id(0) << ";";
    o.new_line() << "const size_t group_y = " << o.group_id(1) << ";";
    // target coordinates swap local/group roles to realize the transpose
    o.new_line() << "const size_t target_x = local_y + group_y * " << block_size << ";";
    o.new_line() << "const size_t target_y = local_x + group_x * " << block_size << ";";
    o.new_line() << "const bool range = global_x < width && global_y < height;";

    // local memory
    {
        std::ostringstream s;
        s << "block[" << block_size * block_size << "]";
        o.smem_static_var(type_name<T2>(), s.str());
    }

    // copy from input to local memory
    o.new_line()
        << "if(range) "
        << "block[local_x + local_y * " << block_size << "] = input[global_x + global_y * width];";

    // wait until the whole block is filled
    o.new_line().barrier();

    // transpose local block to target
    o.new_line()
        << "if(range) "
        << "output[target_x + target_y * height] = block[local_x + local_y * " << block_size << "];";

    o.close("}");

    backend::kernel kernel(queue, o.str(), "transpose");
    kernel.push_arg(in);
    kernel.push_arg(out);
    kernel.push_arg(static_cast<cl_uint>(width));
    kernel.push_arg(static_cast<cl_uint>(height));

    // range multiple of wg size, last block maybe not completely filled.
    size_t r_w = (width + block_size - 1) / block_size;
    size_t r_h = (height + block_size - 1) / block_size;
    kernel.config(backend::ndrange(r_w, r_h), backend::ndrange(block_size, block_size));

    std::ostringstream desc;
    desc << "transpose{"
         << "w=" << width << "(" << r_w << "), "
         << "h=" << height << "(" << r_h << "), "
         << "bs=" << block_size << "}";
    return kernel_call(false, desc.str(), kernel);
}