/// Writes the source of the "radix" kernel to the given source generator.
///
/// The in-place DFT helper returned by in_place_dft(radix.value, invert) is
/// emitted first. The kernel that follows loads radix.value elements from x,
/// multiplies all but the first by a twiddle factor (skipped when p == 1),
/// calls the dft<radix.value> helper on them, and stores the results to y.
///
/// \param o      generator that receives the emitted code.
/// \param radix  radix.value is the number of points handled per work item.
/// \param invert forwarded unchanged to in_place_dft().
///
/// T and T2 are declared outside this block; presumably the real scalar type
/// and its two-component (complex) counterpart -- confirm against the
/// enclosing scope.
inline void kernel_radix(backend::source_generator &o, pow radix, bool invert) {
    // The helper routine has to be emitted ahead of the kernel that calls it.
    o << in_place_dft(radix.value, invert);

    // Kernel header and parameter list.
    o.begin_kernel("radix");
    o.begin_kernel_parameters();
    o.template parameter< global_ptr<const T2> >("x");
    o.template parameter< global_ptr<      T2> >("y");
    o.template parameter< cl_uint            >("p");
    o.template parameter< cl_uint            >("threads");
    o.end_kernel_parameters();

    // Work items past the requested thread count leave immediately.
    o.new_line() << "const size_t i = " << o.global_id(0) << ";";
    o.new_line() << "if(i >= threads) return;";

    // k: index in input sequence, in 0..P-1.
    o.new_line() << "const size_t k = i % p;";
    o.new_line() << "const size_t batch_offset = " << o.global_id(1)
                 << " * threads * " << radix.value << ";";

    // Load: one element per DFT point, spaced `threads` apart.
    o.new_line() << "x += i + batch_offset;";
    for(size_t n = 0; n < radix.value; ++n) {
        o.new_line() << type_name<T2>() << " v" << n
                     << " = x[" << n << " * threads];";
    }

    // Twiddle: v0 is never scaled, hence the loop starts at 1.
    o.new_line() << "if(p != 1)";
    o.open("{");
    for(size_t n = 1; n < radix.value; ++n) {
        const T alpha = -boost::math::constants::two_pi<T>() * n / radix.value;

        // 16 significant digits so the angle literal is not truncated.
        o.new_line() << "v" << n << " = mul(v" << n << ", twiddle(("
                     << type_name<T>() << ")"
                     << std::setprecision(16) << alpha
                     << " * k / p));";
    }
    o.close("}");

    // In-place DFT over v0 .. v<radix.value - 1>, passed by address.
    o.new_line() << "dft" << radix.value;
    param_list(o, "&", 0, radix.value);
    o << ";";

    // Store: outputs are spaced `p` apart starting at j.
    o.new_line() << "const size_t j = k + (i - k) * " << radix.value << ";";
    o.new_line() << "y += j + batch_offset;";
    for(size_t n = 0; n < radix.value; ++n) {
        o.new_line() << "y[" << n << " * p] = v" << n << ";";
    }

    o.end_kernel();
}
inline void kernel_radix(std::ostringstream &o, pow radix, bool invert) { o << in_place_dft(radix.value, invert); // kernel. o << "__kernel void radix(__global const real2_t *x, __global real2_t *y, uint p, uint threads) {\n" << " const size_t i = get_global_id(0);\n" << " if(i >= threads) return;\n" // index in input sequence, in 0..P-1 << " const size_t k = i % p;\n" << " const size_t batch_offset = get_global_id(1) * threads * " << radix.value << ";\n"; // read o << " x += i + batch_offset;\n"; for(size_t i = 0 ; i < radix.value ; i++) o << " real2_t v" << i << " = x[" << i << " * threads];\n"; // twiddle o << " if(p != 1) {\n"; for(size_t i = 1 ; i < radix.value ; i++) { const T alpha = -boost::math::constants::two_pi<T>() * i / radix.value; o << " v" << i << " = mul(v" << i << ", twiddle(" << "(real_t)" << alpha << " * k / p));\n"; } o << " }\n"; // inplace DFT o << " dft" << radix.value; param_list(o, "&", 0, radix.value); o << ";\n"; // write back o << " const size_t j = k + (i - k) * " << radix.value << ";\n"; o << " y += j + batch_offset;\n"; for(size_t i = 0 ; i < radix.value ; i++) o << " y[" << i << " * p] = v" << i << ";\n"; o << "}\n"; }