示例#1
0
/// Writes the source of the "radix" kernel to the generator \p o.
///
/// The result of in_place_dft(radix.value, invert) is streamed first
/// (presumably the definition of the dftN helper the kernel calls -- its body
/// is not visible here), followed by the kernel itself. With N = radix.value,
/// each work item of the generated kernel:
///   - returns immediately when its global id 0 is not below `threads`;
///   - loads N values v0..v(N-1) from `x`, spaced `threads` elements apart;
///   - when p != 1, multiplies v1..v(N-1) by twiddle(alpha * k / p), where
///     k = i % p and alpha = -2*pi*n/N is evaluated on the host;
///   - calls dftN on the addresses of the N values;
///   - stores the values to `y`, spaced `p` elements apart.
/// Global id 1 selects the batch; batches are threads * N elements apart.
///
/// \param o      generator that receives the emitted text.
/// \param radix  radix.value is N, the number of values handled per work item.
/// \param invert forwarded unchanged to in_place_dft().
inline void kernel_radix(backend::source_generator &o, pow radix, bool invert) {
    // Number of values each work item processes.
    const auto points = radix.value;

    // Host-side constant shared by all twiddle angles.
    const T minus_two_pi = -boost::math::constants::two_pi<T>();

    o << in_place_dft(radix.value, invert);

    // Kernel signature.
    o.begin_kernel("radix");
    o.begin_kernel_parameters();
    o.template parameter< global_ptr<const T2> >("x");
    o.template parameter< global_ptr<      T2> >("y");
    o.template parameter< cl_uint              >("p");
    o.template parameter< cl_uint              >("threads");
    o.end_kernel_parameters();

    // Work items past the end of the range have nothing to do.
    o.new_line() << "const size_t i = " << o.global_id(0) << ";";
    o.new_line() << "if(i >= threads) return;";

    // Position inside the input sequence (0..p-1) and start of this batch.
    o.new_line() << "const size_t k = i % p;";
    o.new_line() << "const size_t batch_offset = "
                 << o.global_id(1) << " * threads * " << points << ";";

    // Load the inputs.
    o.new_line() << "x += i + batch_offset;";
    for(size_t n = 0; n < points; ++n) {
        o.new_line() << type_name<T2>() << " v" << n
                     << " = x[" << n << " * threads];";
    }

    // Apply twiddle factors; v0 is never scaled, and p == 1 skips the block.
    o.new_line() << "if(p != 1)";
    o.open("{");
    for(size_t n = 1; n < points; ++n) {
        const T alpha = minus_two_pi * n / points;
        o.new_line() << "v" << n << " = mul(v" << n << ", twiddle("
                     << "(" << type_name<T>() << ")"
                     << std::setprecision(16) << alpha
                     << " * k / p));";
    }
    o.close("}");

    // Transform the loaded values in place.
    o.new_line() << "dft" << points;
    param_list(o, "&", 0, radix.value);
    o << ";";

    // Store the results.
    o.new_line() << "const size_t j = k + (i - k) * " << points << ";";
    o.new_line() << "y += j + batch_offset;";
    for(size_t n = 0; n < points; ++n) {
        o.new_line() << "y[" << n << " * p] = v" << n << ";";
    }
    o.end_kernel();
}
示例#2
0
/// Appends the OpenCL source of the "radix" kernel to the stream \p o.
///
/// The result of in_place_dft(radix.value, invert) is streamed first
/// (presumably the definition of the dftN helper the kernel calls -- its body
/// is not visible here), followed by the kernel text. With N = radix.value,
/// each work item of the generated kernel:
///   - returns immediately when get_global_id(0) is not below `threads`;
///   - loads N values v0..v(N-1) from `x`, spaced `threads` elements apart;
///   - when p != 1, multiplies v1..v(N-1) by twiddle(alpha * k / p), where
///     k = i % p and alpha = -2*pi*n/N is evaluated on the host;
///   - calls dftN on the addresses of the N values;
///   - stores the values to `y`, spaced `p` elements apart.
/// get_global_id(1) selects the batch; batches are threads * N elements apart.
///
/// The twiddle angles are written with at least 16 significant digits so the
/// generated constants are not truncated to the stream's default 6 digits.
/// The stream's precision is restored before returning.
///
/// \param o      stream that receives the emitted text.
/// \param radix  radix.value is N, the number of values handled per work item.
/// \param invert forwarded unchanged to in_place_dft().
inline void kernel_radix(std::ostringstream &o, pow radix, bool invert) {
    o << in_place_dft(radix.value, invert);

    // kernel.
    o << "__kernel void radix(__global const real2_t *x, __global real2_t *y, uint p, uint threads) {\n"
      << "  const size_t i = get_global_id(0);\n"
      << "  if(i >= threads) return;\n"
        // index in input sequence, in 0..P-1
      << "  const size_t k = i % p;\n"
      << "  const size_t batch_offset = get_global_id(1) * threads * " << radix.value << ";\n";

    // read
    o << "  x += i + batch_offset;\n";
    for(size_t i = 0 ; i < radix.value ; i++)
        o << "  real2_t v" << i << " = x[" << i << " * threads];\n";

    // twiddle
    o << "  if(p != 1) {\n";

    // Default stream precision (6 digits) would truncate alpha in the
    // generated source. Raise it to 16 digits, but never lower a precision
    // the caller has already set higher.
    const std::streamsize saved_precision = o.precision();
    if (saved_precision < 16) o.precision(16);

    for(size_t i = 1 ; i < radix.value ; i++) {
        const T alpha = -boost::math::constants::two_pi<T>() * i / radix.value;
        o << "    v" << i << " = mul(v" << i << ", twiddle("
          << "(real_t)" << alpha << " * k / p));\n";
    }

    // Leave the caller's formatting state as it was.
    o.precision(saved_precision);

    o << "  }\n";

    // inplace DFT
    o << "  dft" << radix.value;
    param_list(o, "&", 0, radix.value);
    o << ";\n";

    // write back
    o << "  const size_t j = k + (i - k) * " << radix.value << ";\n";
    o << "  y += j + batch_offset;\n";
    for(size_t i = 0 ; i < radix.value ; i++)
        o << "  y[" << i << " * p] = v" << i << ";\n";
    o << "}\n";
}