inline kernel_call bluestein_pad_kernel(const cl::CommandQueue &queue, size_t n, size_t m, const cl::Buffer &in, const cl::Buffer &out) { std::ostringstream o; kernel_common<T>(o, qdev(queue)); o << "real2_t conj(real2_t v) {\n" << " return (real2_t)(v.x, -v.y);\n" << "}\n"; o << "__kernel void bluestein_pad_kernel(" << "__global real2_t *input, __global real2_t *output, uint n, uint m) {\n" << " const size_t x = get_global_id(0);\n" << " if(x < n || m - x < n)\n" << " output[x] = conj(input[min(x, m - x)]);\n" << " else\n" << " output[x] = (real2_t)(0,0);\n" << "}\n"; auto program = build_sources(qctx(queue), o.str()); cl::Kernel kernel(program, "bluestein_pad_kernel"); kernel.setArg(0, in); kernel.setArg(1, out); kernel.setArg(2, static_cast<cl_uint>(n)); kernel.setArg(3, static_cast<cl_uint>(m)); std::ostringstream desc; desc << "bluestein_pad_kernel{n=" << n << ", m=" << m << "}"; return kernel_call(true, desc.str(), program, kernel, cl::NDRange(m), cl::NullRange); }
inline kernel_call bluestein_mul(const cl::CommandQueue &queue, size_t n, size_t batch, const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out) { std::ostringstream o; kernel_common<T>(o, qdev(queue)); mul_code(o, false); o << "__kernel void bluestein_mul(" << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, uint stride) {\n" << " const size_t x = get_global_id(0), y = get_global_id(1);\n" << " if(x < stride) {\n" << " const size_t off = x + stride * y;" << " output[off] = mul(data[off], exp[x]);\n" << " }\n" << "}\n"; auto program = build_sources(qctx(queue), o.str()); cl::Kernel kernel(program, "bluestein_mul"); kernel.setArg(0, data); kernel.setArg(1, exp); kernel.setArg(2, out); kernel.setArg(3, static_cast<cl_uint>(n)); const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue)); const size_t threads = alignup(n, wg); std::ostringstream desc; desc << "bluestein_mul{n=" << n << "(" << threads << "), wg=" << wg << ", batch=" << batch << "}"; return kernel_call(false, desc.str(), program, kernel, cl::NDRange(threads, batch), cl::NDRange(wg, 1)); }
inline kernel_call radix_kernel(bool once, const cl::CommandQueue &queue, size_t n, size_t batch, bool invert, pow radix, size_t p, const cl::Buffer &in, const cl::Buffer &out) { std::ostringstream o; o << std::setprecision(25); const auto device = qdev(queue); kernel_common<T>(o, device); mul_code(o, invert); twiddle_code<T>(o); const size_t m = n / radix.value; kernel_radix<T>(o, radix, invert); auto program = build_sources(qctx(queue), o.str(), "-cl-mad-enable -cl-fast-relaxed-math"); cl::Kernel kernel(program, "radix"); kernel.setArg(0, in); kernel.setArg(1, out); kernel.setArg(2, static_cast<cl_uint>(p)); kernel.setArg(3, static_cast<cl_uint>(m)); const size_t wg_mul = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(device); //const size_t max_cu = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); //const size_t max_wg = device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); size_t wg = wg_mul; //while(wg * max_cu < max_wg) wg += wg_mul; //wg -= wg_mul; const size_t threads = alignup(m, wg); std::ostringstream desc; desc << "dft{r=" << radix << ", p=" << p << ", n=" << n << ", batch=" << batch << ", threads=" << m << "(" << threads << "), wg=" << wg << "}"; return kernel_call(once, desc.str(), program, kernel, cl::NDRange(threads, batch), cl::NDRange(wg, 1)); }
inline kernel_call transpose_kernel(const cl::CommandQueue &queue, size_t width, size_t height, const cl::Buffer &in, const cl::Buffer &out) { std::ostringstream o; const auto dev = qdev(queue); kernel_common<T>(o, dev); // determine max block size to fit into local memory/workgroup size_t block_size = 128; { const auto local_size = dev.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>(); const auto workgroup = dev.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); while(block_size * block_size * sizeof(T) * 2 > local_size) block_size /= 2; while(block_size * block_size > workgroup) block_size /= 2; } // from NVIDIA SDK. o << "__kernel void transpose(" << "__global const real2_t *input, __global real2_t *output, uint width, uint height) {\n" << " const size_t " << " global_x = get_global_id(0), global_y = get_global_id(1),\n" << " local_x = get_local_id(0), local_y = get_local_id(1),\n" << " group_x = get_group_id(0), group_y = get_group_id(1),\n" << " block_size = " << block_size << ",\n" << " target_x = local_y + group_y * block_size,\n" << " target_y = local_x + group_x * block_size;\n" << " const bool range = global_x < width && global_y < height;\n" // local memory << " __local real2_t block[" << (block_size * block_size) << "];\n" // copy from input to local memory << " if(range)\n" << " block[local_x + local_y * block_size] = input[global_x + global_y * width];\n" // wait until the whole block is filled << " barrier(CLK_LOCAL_MEM_FENCE);\n" // transpose local block to target << " if(range)\n" << " output[target_x + target_y * height] = block[local_x + local_y * block_size];\n" << "}\n"; auto program = build_sources(qctx(queue), o.str()); cl::Kernel kernel(program, "transpose"); kernel.setArg(0, in); kernel.setArg(1, out); kernel.setArg(2, static_cast<cl_uint>(width)); kernel.setArg(3, static_cast<cl_uint>(height)); // range multiple of wg size, last block maybe not completely filled. size_t r_w = alignup(width, block_size); size_t r_h = alignup(height, block_size); std::ostringstream desc; desc << "transpose{" << "w=" << width << "(" << r_w << "), " << "h=" << height << "(" << r_h << "), " << "bs=" << block_size << "}"; return kernel_call(false, desc.str(), program, kernel, cl::NDRange(r_w, r_h), cl::NDRange(block_size, block_size)); }
inline kernel_call bluestein_mul_in(const cl::CommandQueue &queue, bool inverse, size_t batch, size_t radix, size_t p, size_t threads, size_t stride, const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out) { std::ostringstream o; kernel_common<T>(o, qdev(queue)); mul_code(o, false); twiddle_code<T>(o); o << "__kernel void bluestein_mul_in(" << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, " << "uint radix, uint p, uint out_stride) {\n" << " const size_t\n" << " thread = get_global_id(0), threads = get_global_size(0),\n" << " batch = get_global_id(1),\n" << " element = get_global_id(2);\n" << " if(element < out_stride) {\n" << " const size_t\n" << " in_off = thread + batch * radix * threads + element * threads,\n" << " out_off = thread * out_stride + batch * out_stride * threads + element;\n" << " if(element < radix) {\n" << " real2_t w = exp[element];" << " if(p != 1) {\n" << " const int sign = " << (inverse ? "+1" : "-1") << ";\n" << " ulong a = (ulong)element * (thread % p);\n" << " ulong b = (ulong)radix * p;\n" << " real2_t t = twiddle(2 * sign * M_PI * (a % (2 * b)) / b);\n" << " w = mul(w, t);\n" << " }\n" << " output[out_off] = mul(data[in_off], w);\n" << " } else\n" << " output[out_off] = (real2_t)(0,0);" << " }\n" << "}\n"; auto program = build_sources(qctx(queue), o.str()); cl::Kernel kernel(program, "bluestein_mul_in"); kernel.setArg(0, data); kernel.setArg(1, exp); kernel.setArg(2, out); kernel.setArg(3, static_cast<cl_uint>(radix)); kernel.setArg(4, static_cast<cl_uint>(p)); kernel.setArg(5, static_cast<cl_uint>(stride)); const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue)); const size_t stride_pad = alignup(stride, wg); std::ostringstream desc; desc << "bluestein_mul_in{batch=" << batch << ", radix=" << radix << ", p=" << p << ", threads=" << threads << ", stride=" << stride << "(" << stride_pad << "), wg=" << wg << "}"; return kernel_call(false, desc.str(), program, kernel, cl::NDRange(threads, batch, stride_pad), cl::NDRange(1, 1, wg)); }
inline kernel_call bluestein_twiddle(const cl::CommandQueue &queue, size_t n, bool inverse, const cl::Buffer &out) { std::ostringstream o; kernel_common<T>(o, qdev(queue)); twiddle_code<T>(o); o << "__kernel void bluestein_twiddle(__global real2_t *output) {\n" << " const size_t x = get_global_id(0), n = get_global_size(0);\n" << " const int sign = " << (inverse ? "+1" : "-1") << ";\n" << " const size_t xx = ((ulong)x * x) % (2 * n);\n" << " output[x] = twiddle(sign * M_PI * xx / n);\n" << "}\n"; auto program = build_sources(qctx(queue), o.str()); cl::Kernel kernel(program, "bluestein_twiddle"); kernel.setArg(0, out); std::ostringstream desc; desc << "bluestein_twiddle{n=" << n << ", inverse=" << inverse << "}"; return kernel_call(true, desc.str(), program, kernel, cl::NDRange(n), cl::NullRange); }
inline kernel_call bluestein_mul_out(const cl::CommandQueue &queue, size_t batch, size_t p, size_t radix, size_t threads, size_t stride, const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out) { std::ostringstream o; kernel_common<T>(o, qdev(queue)); mul_code(o, false); o << "__kernel void bluestein_mul_out(" << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, " << "real_t div, uint p, uint in_stride, uint radix) {\n" << " const size_t\n" << " i = get_global_id(0), threads = get_global_size(0),\n" << " b = get_global_id(1),\n" << " l = get_global_id(2);\n" << " if(l < radix) {\n" << " const size_t\n" << " k = i % p,\n" << " j = k + (i - k) * radix,\n" << " in_off = i * in_stride + b * in_stride * threads + l,\n" << " out_off = j + b * threads * radix + l * p;\n" << " output[out_off] = mul(data[in_off] * div, exp[l]);\n" << " }\n" << "}\n"; auto program = build_sources(qctx(queue), o.str()); cl::Kernel kernel(program, "bluestein_mul_out"); kernel.setArg(0, data); kernel.setArg(1, exp); kernel.setArg(2, out); kernel.setArg<T>(3, static_cast<T>(1) / stride); kernel.setArg(4, static_cast<cl_uint>(p)); kernel.setArg(5, static_cast<cl_uint>(stride)); kernel.setArg(6, static_cast<cl_uint>(radix)); const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue)); const size_t radix_pad = alignup(radix, wg); std::ostringstream desc; desc << "bluestein_mul_out{r=" << radix << "(" << radix_pad << "), wg=" << wg << ", batch=" << batch << ", p=" << p << ", thr=" << threads << ", stride=" << stride << "}"; return kernel_call(false, desc.str(), program, kernel, cl::NDRange(threads, batch, radix_pad), cl::NDRange(1, 1, wg)); }
typename std::enable_if< !boost::proto::matches< typename boost::proto::result_of::as_expr<ExprTuple>::type, multivector_full_grammar >::value #if !defined(_MSC_VER) || _MSC_VER >= 1700 && N == std::tuple_size<ExprTuple>::value #endif , const multivector& >::type operator=(const ExprTuple &expr) { #endif static kernel_cache cache; const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list(); for(uint d = 0; d < queue.size(); d++) { cl::Context context = qctx(queue[d]); cl::Device device = qdev(queue[d]); auto kernel = cache.find( context() ); if (kernel == cache.end()) { std::ostringstream source; source << standard_kernel_header(device); { get_header f(source); for_each<0>(expr, f); } source << "kernel void multi_expr_tuple(\n" "\t" << type_name<size_t>() << " n"; for(uint i = 1; i <= N; i++) source << ",\n\tglobal " << type_name<T>() << " *res_" << i; { get_params f(source); for_each<0>(expr, f); } source << "\n)\n{\n"; if ( is_cpu(device) ) { source << "\tsize_t chunk_size = (n + get_global_size(0) - 1) / get_global_size(0);\n" "\tsize_t chunk_start = get_global_id(0) * chunk_size;\n" "\tsize_t chunk_end = min(n, chunk_start + chunk_size);\n" "\tfor(size_t idx = chunk_start; idx < chunk_end; ++idx) {\n"; } else { source << "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n"; } { get_expressions f(source); for_each<0>(expr, f); } source << "\n"; for(uint i = 1; i <= N; i++) source << "\t\tres_" << i << "[idx] = buf_" << i << ";\n"; source << "\t}\n}\n"; auto program = build_sources(context, source.str()); cl::Kernel krn(program, "multi_expr_tuple"); size_t wgs = kernel_workgroup_size(krn, device); kernel = cache.insert(std::make_pair( context(), kernel_cache_entry(krn, wgs) )).first; } if (size_t psize = vec[0]->part_size(d)) { size_t w_size = kernel->second.wgsize; size_t g_size = num_workgroups(device) * w_size; uint pos = 0; kernel->second.kernel.setArg(pos++, psize); for(uint i = 0; i < N; i++) kernel->second.kernel.setArg(pos++, (*vec[i])(d)); { set_arguments f(kernel->second.kernel, d, pos, vec[0]->part_start(d)); for_each<0>(expr, f); } queue[d].enqueueNDRangeKernel( kernel->second.kernel, cl::NullRange, g_size, w_size ); } } return *this; }
typename std::enable_if< boost::proto::matches< typename boost::proto::result_of::as_expr<Expr>::type, multivector_expr_grammar >::value, const multivector& >::type operator=(const Expr& expr) { static kernel_cache cache; const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list(); // If any device in context is CPU, then do not fuse the kernel, // but assign components individually. if (std::any_of(queue.begin(), queue.end(), [](const cl::CommandQueue &q) { return is_cpu(qdev(q)); })) { assign_subexpressions<0, N>(boost::proto::as_child(expr)); return *this; } for(uint d = 0; d < queue.size(); d++) { cl::Context context = qctx(queue[d]); cl::Device device = qdev(queue[d]); auto kernel = cache.find( context() ); if (kernel == cache.end()) { std::ostringstream kernel_name; kernel_name << "multi_"; vector_name_context name_ctx(kernel_name); boost::proto::eval(boost::proto::as_child(expr), name_ctx); std::ostringstream source; source << standard_kernel_header(device); extract_user_functions()( boost::proto::as_child(expr), declare_user_function(source) ); source << "kernel void " << kernel_name.str() << "(\n\t" << type_name<size_t>() << " n"; for(size_t i = 0; i < N; ) source << ",\n\tglobal " << type_name<T>() << " *res_" << ++i; build_param_list<N>(boost::proto::as_child(expr), source); source << "\n)\n{\n" "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n"; build_expr_list(boost::proto::as_child(expr), source); source << "\t}\n}\n"; auto program = build_sources(context, source.str()); cl::Kernel krn(program, kernel_name.str().c_str()); size_t wgs = kernel_workgroup_size(krn, device); kernel = cache.insert(std::make_pair( context(), kernel_cache_entry(krn, wgs) )).first; } if (size_t psize = vec[0]->part_size(d)) { size_t w_size = kernel->second.wgsize; size_t g_size = num_workgroups(device) * w_size; uint pos = 0; kernel->second.kernel.setArg(pos++, psize); for(uint i = 0; i < N; i++) kernel->second.kernel.setArg(pos++, vec[i]->operator()(d)); set_kernel_args<N>( boost::proto::as_child(expr), kernel->second.kernel, d, pos, vec[0]->part_start(d) ); queue[d].enqueueNDRangeKernel( kernel->second.kernel, cl::NullRange, g_size, w_size ); } } return *this; }