Пример #1
0
inline kernel_call bluestein_pad_kernel(const cl::CommandQueue &queue, size_t n, size_t m, const cl::Buffer &in, const cl::Buffer &out) {
    // Assemble the OpenCL source: a complex-conjugate helper followed by the
    // padding kernel. For x < n or m - x < n the kernel mirrors/conjugates the
    // input; every other slot of the length-m output is zero-filled.
    std::ostringstream src;
    kernel_common<T>(src, qdev(queue));

    src << "real2_t conj(real2_t v) {\n"
           "  return (real2_t)(v.x, -v.y);\n"
           "}\n";
    src << "__kernel void bluestein_pad_kernel("
           "__global real2_t *input, __global real2_t *output, uint n, uint m) {\n"
           "  const size_t x = get_global_id(0);\n"
           "  if(x < n || m - x < n)\n"
           "    output[x] = conj(input[min(x, m - x)]);\n"
           "  else\n"
           "    output[x] = (real2_t)(0,0);\n"
           "}\n";

    auto program = build_sources(qctx(queue), src.str());
    cl::Kernel kernel(program, "bluestein_pad_kernel");

    cl_uint arg = 0;
    kernel.setArg(arg++, in);
    kernel.setArg(arg++, out);
    kernel.setArg(arg++, static_cast<cl_uint>(n));
    kernel.setArg(arg++, static_cast<cl_uint>(m));

    // One work-item per output element; the runtime picks the local size.
    std::ostringstream desc;
    desc << "bluestein_pad_kernel{n=" << n << ", m=" << m << "}";
    return kernel_call(true, desc.str(), program, kernel, cl::NDRange(m), cl::NullRange);
}
Пример #2
0
// Builds the element-wise product kernel used by the Bluestein FFT:
// for x in [0, stride) and batch row y,
//   output[x + stride*y] = mul(data[x + stride*y], exp[x])
// where mul() is the complex-product helper emitted by mul_code(). The
// kernel's "stride" argument is bound to n; the first global dimension is
// padded to a whole number of work-groups and guarded by the in-kernel
// range check.
inline kernel_call bluestein_mul(const cl::CommandQueue &queue, size_t n, size_t batch, const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out) {
    std::ostringstream o;
    kernel_common<T>(o, qdev(queue));
    mul_code(o, false);

    o << "__kernel void bluestein_mul("
      << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, uint stride) {\n"
      << "  const size_t x = get_global_id(0), y = get_global_id(1);\n"
      << "  if(x < stride) {\n"
      // the trailing "\n" was missing here, which glued this statement and
      // the next onto a single line of the generated source
      << "    const size_t off = x + stride * y;\n"
      << "    output[off] = mul(data[off], exp[x]);\n"
      << "  }\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "bluestein_mul");
    kernel.setArg(0, data);
    kernel.setArg(1, exp);
    kernel.setArg(2, out);
    kernel.setArg(3, static_cast<cl_uint>(n)); // the kernel's "stride"

    // Round n up to the preferred work-group size multiple.
    const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue));
    const size_t threads = alignup(n, wg);

    std::ostringstream desc;
    desc << "bluestein_mul{n=" << n << "(" << threads << "), wg=" << wg << ", batch=" << batch << "}";
    return kernel_call(false, desc.str(), program, kernel, cl::NDRange(threads, batch), cl::NDRange(wg, 1));
}
Пример #3
0
inline kernel_call radix_kernel(bool once, const cl::CommandQueue &queue, size_t n, size_t batch, bool invert, pow radix, size_t p, const cl::Buffer &in, const cl::Buffer &out) {
    // Assemble the radix-FFT butterfly kernel: shared helpers first (complex
    // mul, twiddle), then the radix body. High stream precision so any
    // constants embedded in the source round-trip exactly.
    std::ostringstream src;
    src << std::setprecision(25);
    const auto device = qdev(queue);
    kernel_common<T>(src, device);
    mul_code(src, invert);
    twiddle_code<T>(src);

    const size_t m = n / radix.value; // butterflies per transform
    kernel_radix<T>(src, radix, invert);

    auto program = build_sources(qctx(queue), src.str(), "-cl-mad-enable -cl-fast-relaxed-math");
    cl::Kernel kernel(program, "radix");

    cl_uint arg = 0;
    kernel.setArg(arg++, in);
    kernel.setArg(arg++, out);
    kernel.setArg(arg++, static_cast<cl_uint>(p));
    kernel.setArg(arg++, static_cast<cl_uint>(m));

    // One work-item per butterfly, rounded up to whole work-groups of the
    // kernel's preferred size multiple.
    const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(device);
    const size_t threads = alignup(m, wg);

    std::ostringstream desc;
    desc << "dft{r=" << radix << ", p=" << p << ", n=" << n << ", batch=" << batch << ", threads=" << m << "(" << threads << "), wg=" << wg << "}";

    return kernel_call(once, desc.str(), program, kernel, cl::NDRange(threads, batch), cl::NDRange(wg, 1));
}
Пример #4
0
// Builds a tiled matrix-transpose kernel (pattern from the NVIDIA SDK):
// a block_size x block_size tile is staged in local memory so both the
// global read and the global write are contiguous. Input is a width x
// height matrix of real2_t; output receives the transpose.
inline kernel_call transpose_kernel(const cl::CommandQueue &queue, size_t width, size_t height, const cl::Buffer &in, const cl::Buffer &out) {
    std::ostringstream o;
    const auto dev = qdev(queue);
    kernel_common<T>(o, dev);

    // determine max block size to fit into local memory/workgroup
    size_t block_size = 128;
    {
        const auto local_size = dev.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
        const auto workgroup = dev.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
        // each tile element is a real2_t, i.e. two reals of type T — hence
        // sizeof(T) * 2 bytes per element; halve until the tile fits, and
        // until block_size^2 work-items fit in one work-group
        while(block_size * block_size * sizeof(T) * 2 > local_size) block_size /= 2;
        while(block_size * block_size > workgroup) block_size /= 2;
    }

    // from NVIDIA SDK.
    o << "__kernel void transpose("
      << "__global const real2_t *input, __global real2_t *output, uint width, uint height) {\n"
      << "  const size_t "
      << "    global_x = get_global_id(0), global_y = get_global_id(1),\n"
      << "    local_x = get_local_id(0), local_y = get_local_id(1),\n"
      << "    group_x = get_group_id(0), group_y = get_group_id(1),\n"
      << "    block_size = " << block_size << ",\n"
      << "    target_x = local_y + group_y * block_size,\n"
      << "    target_y = local_x + group_x * block_size;\n"
      << "  const bool range = global_x < width && global_y < height;\n"
        // local memory
      << "  __local real2_t block[" << (block_size * block_size) << "];\n"
        // copy from input to local memory
      << "  if(range)\n"
      << "    block[local_x + local_y * block_size] = input[global_x + global_y * width];\n"
        // wait until the whole block is filled
      << "  barrier(CLK_LOCAL_MEM_FENCE);\n"
        // transpose local block to target
      << "  if(range)\n"
      << "    output[target_x + target_y * height] = block[local_x + local_y * block_size];\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "transpose");
    kernel.setArg(0, in);
    kernel.setArg(1, out);
    kernel.setArg(2, static_cast<cl_uint>(width));
    kernel.setArg(3, static_cast<cl_uint>(height));

    // range multiple of wg size, last block maybe not completely filled.
    size_t r_w = alignup(width, block_size);
    size_t r_h = alignup(height, block_size);

    std::ostringstream desc;
    desc << "transpose{"
         << "w=" << width << "(" << r_w << "), "
         << "h=" << height << "(" << r_h << "), "
         << "bs=" << block_size << "}";

    // Global range is the padded rectangle; the in-kernel `range` flag
    // discards work-items outside the real matrix.
    return kernel_call(false, desc.str(), program, kernel, cl::NDRange(r_w, r_h),
        cl::NDRange(block_size, block_size));
}
Пример #5
0
// Bluestein padding kernel, source-generator backend. For every x < m the
// kernel writes conj(input[min(x, m - x)]) when x < n or m - x < n (the
// leading n entries and their mirrored tail), and zero otherwise. `in`
// holds the n-element sequence, `out` the padded m-element one.
inline kernel_call bluestein_pad_kernel(
        const backend::command_queue &queue, size_t n, size_t m,
        const backend::device_vector<T2> &in,
        const backend::device_vector<T2> &out
        )
{
    // Installs the common FFT helper definitions for T for the duration of
    // this function (scoped header).
    scoped_program_header header(queue, fft_kernel_header<T>());

    backend::source_generator o(queue);
    o << std::setprecision(25); // full precision for any embedded constants

    // conj(v) = {v.x, -v.y}: complex conjugate of a two-component value.
    o.begin_function<T2>("conj");
    o.begin_function_parameters();
    o.template parameter<T2>("v");
    o.end_function_parameters();
    o.new_line() << type_name<T2>() << " r = {v.x, -v.y};";
    o.new_line() << "return r;";
    o.end_function();

    o.begin_kernel("bluestein_pad_kernel");
    o.begin_kernel_parameters();
    o.template parameter< global_ptr<const T2> >("input");
    o.template parameter< global_ptr<      T2> >("output");
    o.template parameter< cl_uint              >("n");
    o.template parameter< cl_uint              >("m");
    o.end_kernel_parameters();
    o.new_line() << "const uint x = " << o.global_id(0) << ";";
    // guard: the launch range below is rounded up to whole work-groups
    o.new_line() << "if (x < m)";
    o.open("{");
    o.new_line() << "if(x < n || m - x < n)";
    o.open("{");
    o.new_line() << "output[x] = conj(input[min(x, m - x)]);";
    o.close("}");
    o.new_line() << "else";
    o.open("{");
    o.new_line() << type_name<T2>() << " r = {0,0};";
    o.new_line() << "output[x] = r;";
    o.close("}");
    o.close("}");
    o.end_kernel();

    backend::kernel kernel(queue, o.str(), "bluestein_pad_kernel");
    kernel.push_arg(in);
    kernel.push_arg(out);
    kernel.push_arg(static_cast<cl_uint>(n));
    kernel.push_arg(static_cast<cl_uint>(m));

    // gs = number of work-groups of size ws needed to cover m work-items.
    size_t ws = kernel.preferred_work_group_size_multiple(queue);
    size_t gs = (m + ws - 1) / ws;

    kernel.config(gs, ws);

    std::ostringstream desc;
    desc << "bluestein_pad_kernel{n=" << n << ", m=" << m << "}";
    return kernel_call(true, desc.str(), kernel);
}
Пример #6
0
// Element-wise product with the precomputed chirp table, source-generator
// backend: for x < stride (bound to n) and each batch row y,
//   output[x + stride*y] = mul(data[x + stride*y], exp[x]).
inline kernel_call bluestein_mul(
        const backend::command_queue &queue, size_t n, size_t batch,
        const backend::device_vector<T2> &data,
        const backend::device_vector<T2> &exp,
        const backend::device_vector<T2> &out
        )
{
    // Installs the common FFT helper definitions for T (scoped header).
    scoped_program_header header(queue, fft_kernel_header<T>());

    backend::source_generator o(queue);
    o << std::setprecision(25); // full precision for any embedded constants

    mul_code<T2>(o, false); // emits the complex-product helper mul()

    o.begin_kernel("bluestein_mul");
    o.begin_kernel_parameters();
    o.template parameter< global_ptr<const T2> >("data");
    o.template parameter< global_ptr<const T2> >("exp");
    o.template parameter< global_ptr<      T2> >("output");
    o.template parameter< cl_uint              >("stride");
    o.end_kernel_parameters();

    o.new_line() << "const size_t x = " << o.global_id(0) << ";";
    o.new_line() << "const size_t y = " << o.global_id(1) << ";";

    // guard against the padded first dimension
    o.new_line() << "if(x < stride)";
    o.open("{");

    o.new_line() << "const size_t off = x + stride * y;";
    o.new_line() << "output[off] = mul(data[off], exp[x]);";

    o.close("}");
    o.end_kernel();

    backend::kernel kernel(queue, o.str(), "bluestein_mul");
    kernel.push_arg(data);
    kernel.push_arg(exp);
    kernel.push_arg(out);
    kernel.push_arg(static_cast<cl_uint>(n)); // the kernel's "stride"

    // threads = number of work-groups of size wg needed to cover n items.
    const size_t wg = kernel.preferred_work_group_size_multiple(queue);
    const size_t threads = (n + wg - 1) / wg;

    kernel.config(backend::ndrange(threads, batch), backend::ndrange(wg, 1));

    std::ostringstream desc;
    desc << "bluestein_mul{n=" << n << "(" << threads << "), wg=" << wg << ", batch=" << batch << "}";
    return kernel_call(false, desc.str(), kernel);
}
Пример #7
0
// Bluestein pre-multiply: scatters each radix-sized input segment into a
// zero-padded, out_stride-wide row, multiplying the first `radix` elements
// by the chirp table `exp` and, when p != 1, by an additional twiddle
// factor. Grid: (threads, batch, stride padded to the work-group size);
// only the third dimension is grouped.
inline kernel_call bluestein_mul_in(const cl::CommandQueue &queue, bool inverse, size_t batch, size_t radix, size_t p, size_t threads, size_t stride, const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out) {
    std::ostringstream o;
    kernel_common<T>(o, qdev(queue));
    mul_code(o, false);
    twiddle_code<T>(o);

    o << "__kernel void bluestein_mul_in("
      << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, "
      << "uint radix, uint p, uint out_stride) {\n"
      << "  const size_t\n"
      << "    thread = get_global_id(0), threads = get_global_size(0),\n"
      << "    batch = get_global_id(1),\n"
      << "    element = get_global_id(2);\n"
      << "  if(element < out_stride) {\n"
      << "    const size_t\n"
      << "      in_off = thread + batch * radix * threads + element * threads,\n"
      << "      out_off = thread * out_stride + batch * out_stride * threads + element;\n"
      << "    if(element < radix) {\n"
      // this statement and the zero-fill below previously lacked their
      // trailing "\n", collapsing the generated source onto single lines
      << "      real2_t w = exp[element];\n"
      << "      if(p != 1) {\n"
      << "        const int sign = " << (inverse ? "+1" : "-1") << ";\n"
      << "        ulong a = (ulong)element * (thread % p);\n"
      << "        ulong b = (ulong)radix * p;\n"
      << "        real2_t t = twiddle(2 * sign * M_PI * (a % (2 * b)) / b);\n"
      << "        w = mul(w, t);\n"
      << "      }\n"
      << "      output[out_off] = mul(data[in_off], w);\n"
      << "    } else\n"
      << "      output[out_off] = (real2_t)(0,0);\n"
      << "  }\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "bluestein_mul_in");
    kernel.setArg(0, data);
    kernel.setArg(1, exp);
    kernel.setArg(2, out);
    kernel.setArg(3, static_cast<cl_uint>(radix));
    kernel.setArg(4, static_cast<cl_uint>(p));
    kernel.setArg(5, static_cast<cl_uint>(stride));

    // Pad the element dimension to a whole number of preferred work-groups;
    // the in-kernel range check discards the excess work-items.
    const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue));
    const size_t stride_pad = alignup(stride, wg);

    std::ostringstream desc;
    desc << "bluestein_mul_in{batch=" << batch << ", radix=" << radix << ", p=" << p << ", threads=" << threads << ", stride=" << stride << "(" << stride_pad << "), wg=" << wg << "}";
    return kernel_call(false, desc.str(), program, kernel, cl::NDRange(threads, batch, stride_pad), cl::NDRange(1, 1, wg));
}
Пример #8
0
// Radix butterfly kernel, source-generator backend.
inline kernel_call radix_kernel(
        bool once, const backend::command_queue &queue, size_t n, size_t batch,
        bool invert, pow radix, size_t p,
        const backend::device_vector<T2> &in,
        const backend::device_vector<T2> &out
        )
{
    // Emit helper definitions (complex mul, twiddle) followed by the radix
    // body; constants are streamed at full precision.
    backend::source_generator src;
    src << std::setprecision(25);
    kernel_common<T>(src, queue);
    mul_code<T2>(src, invert);
    twiddle_code<T, T2>(src);

    const size_t m = n / radix.value; // butterflies per transform
    kernel_radix<T, T2>(src, radix, invert);

    // Fast-math compile options differ between the OpenCL and CUDA backends.
    backend::kernel kernel(queue, src.str(), "radix", 0,
#ifndef VEXCL_BACKEND_CUDA
            "-cl-mad-enable -cl-fast-relaxed-math"
#else
            "--use_fast_math"
#endif
            );

    kernel.push_arg(in);
    kernel.push_arg(out);
    kernel.push_arg(static_cast<cl_uint>(p));
    kernel.push_arg(static_cast<cl_uint>(m));

    // Cover the m butterflies with whole work-groups of the preferred size.
    const size_t wg = kernel.preferred_work_group_size_multiple(queue);
    const size_t threads = (m + wg - 1) / wg;

    kernel.config(backend::ndrange(threads, batch), backend::ndrange(wg, 1));

    std::ostringstream desc;
    desc << "dft{r=" << radix << ", p=" << p << ", n=" << n << ", batch=" << batch << ", threads=" << m << "(" << threads << "), wg=" << wg << "}";

    return kernel_call(once, desc.str(), kernel);
}
Пример #9
0
// Launches `kernel_call` over a width x height integer grid, copies the
// result back to the host, and verifies that element i holds the value i.
// Returns 0 on success, 1 on allocation failure or any mismatch.
int main()
{
  int width = 320;
  int height = 112;

  int *a = (int*)malloc(width*height*sizeof(int));
  if(!a)
    return 1; // host allocation failed

  int pitch = sizeof(int)*width;

  auto acc = hc::accelerator();
  int* a_d = (int*)hc::am_alloc(width*height*sizeof(int), acc, 0);
  if(!a_d)
  {
    free(a); // device allocation failed
    return 1;
  }

  grid_launch_parm lp;
  grid_launch_init(&lp);

  // assumes width/height are multiples of TILE_I/TILE_J — TODO confirm
  lp.grid_dim = gl_dim3(width/TILE_I, height/TILE_J);
  lp.group_dim = gl_dim3(TILE_I, TILE_J);

  hc::completion_future cf;
  lp.cf = &cf;
  kernel_call(lp, a_d, pitch);
  lp.cf->wait();

  static hc::accelerator_view av = acc.get_default_view();
  av.copy(a_d, a, width*height*sizeof(int));

  // count mismatching elements
  int ret = 0;
  for(int i = 0; i < width*height; ++i)
  {
    if(a[i] != i)
      ret++;
  }

  // release device and host buffers on every exit path (the original
  // leaked both when mismatches were found)
  hc::am_free(a_d);
  free(a);

  if(ret != 0)
  {
    printf("errors: %d\n", ret);
    return 1;
  }

  return 0;
}
Пример #10
0
inline kernel_call bluestein_twiddle(const cl::CommandQueue &queue, size_t n, bool inverse, const cl::Buffer &out) {
    // Generate the Bluestein chirp table: each work-item x writes
    // twiddle(sign * M_PI * (x^2 mod 2n) / n), with the sign baked into the
    // source from the `inverse` flag.
    std::ostringstream src;
    kernel_common<T>(src, qdev(queue));
    twiddle_code<T>(src);

    src << "__kernel void bluestein_twiddle(__global real2_t *output) {\n"
           "  const size_t x = get_global_id(0), n = get_global_size(0);\n"
           "  const int sign = " << (inverse ? "+1" : "-1") << ";\n"
           "  const size_t xx = ((ulong)x * x) % (2 * n);\n"
           "  output[x] = twiddle(sign * M_PI * xx / n);\n"
           "}\n";

    auto program = build_sources(qctx(queue), src.str());
    cl::Kernel kernel(program, "bluestein_twiddle");
    kernel.setArg(0, out);

    // Exactly n work-items; local size is left to the runtime.
    std::ostringstream desc;
    desc << "bluestein_twiddle{n=" << n << ", inverse=" << inverse << "}";
    return kernel_call(true, desc.str(), program, kernel, cl::NDRange(n), cl::NullRange);
}
Пример #11
0
// Bluestein chirp-table generator, source-generator backend: for x < n the
// kernel writes twiddle(±pi * (x^2 mod 2n) / n), with the signed pi
// constant baked into the source text.
inline kernel_call bluestein_twiddle(
        const backend::command_queue &queue, size_t n, bool inverse,
        const backend::device_vector<T2> &out
        )
{
    // Installs the common FFT helper definitions for T (scoped header).
    scoped_program_header header(queue, fft_kernel_header<T>());

    backend::source_generator o(queue);
    o << std::setprecision(25);

    twiddle_code<T, T2>(o); // emits the twiddle() helper

    o.begin_kernel("bluestein_twiddle");
    o.begin_kernel_parameters();
    o.template parameter< size_t         >("n");
    o.template parameter< global_ptr<T2> >("output");
    o.end_kernel_parameters();

    o.new_line() << "const size_t x = " << o.global_id(0) << ";";

    o.new_line() << "const size_t xx = ((ulong)x * x) % (2 * n);";
    // the +/-pi constant is embedded in the source at 16 significant digits
    o.new_line() << "if (x < n) output[x] = twiddle("
        << std::setprecision(16)
        << (inverse ? 1 : -1) * boost::math::constants::pi<T>()
        << " * xx / n);";

    o.end_kernel();

    backend::kernel kernel(queue, o.str(), "bluestein_twiddle");
    kernel.push_arg(n);
    kernel.push_arg(out);

    // gs = number of work-groups of size ws needed to cover n work-items.
    size_t ws = kernel.preferred_work_group_size_multiple(queue);
    size_t gs = (n + ws - 1) / ws;

    kernel.config(gs, ws);

    std::ostringstream desc;
    desc << "bluestein_twiddle{n=" << n << ", inverse=" << inverse << "}";
    return kernel_call(true, desc.str(), kernel);
}
Пример #12
0
// Bluestein post-multiply: reads the strided intermediate result, scales it
// by div = 1/stride, complex-multiplies it with exp[l], and scatters it into
// the radix-interleaved output layout. Grid: (threads, batch, radix padded
// to the work-group size); only the third dimension is grouped.
inline kernel_call bluestein_mul_out(const cl::CommandQueue &queue, size_t batch, size_t p, size_t radix, size_t threads, size_t stride, const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out) {
    std::ostringstream o;
    kernel_common<T>(o, qdev(queue));
    mul_code(o, false);

    o << "__kernel void bluestein_mul_out("
      << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, "
      << "real_t div, uint p, uint in_stride, uint radix) {\n"
      << "  const size_t\n"
      << "    i = get_global_id(0), threads = get_global_size(0),\n"
      << "    b = get_global_id(1),\n"
      << "    l = get_global_id(2);\n"
      << "  if(l < radix) {\n"
        // j maps thread i to its radix-interleaved output position
      << "    const size_t\n"
      << "      k = i % p,\n"
      << "      j = k + (i - k) * radix,\n"
      << "      in_off = i * in_stride + b * in_stride * threads + l,\n"
      << "      out_off = j + b * threads * radix + l * p;\n"
        // data[in_off] * div uses OpenCL's vector-by-scalar multiply
      << "    output[out_off] = mul(data[in_off] * div, exp[l]);\n"
      << "  }\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "bluestein_mul_out");
    kernel.setArg(0, data);
    kernel.setArg(1, exp);
    kernel.setArg(2, out);
    kernel.setArg<T>(3, static_cast<T>(1) / stride); // div = 1/stride
    kernel.setArg(4, static_cast<cl_uint>(p));
    kernel.setArg(5, static_cast<cl_uint>(stride));
    kernel.setArg(6, static_cast<cl_uint>(radix));

    // Pad the l dimension to a whole number of preferred work-groups.
    const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue));
    const size_t radix_pad = alignup(radix, wg);

    std::ostringstream desc;
    desc << "bluestein_mul_out{r=" << radix << "(" << radix_pad << "), wg=" << wg << ", batch=" << batch << ", p=" << p << ", thr=" << threads << ", stride=" << stride << "}";
    return kernel_call(false, desc.str(), program, kernel, cl::NDRange(threads, batch, radix_pad), cl::NDRange(1, 1, wg));
}
Пример #13
0
// Bluestein post-multiply, source-generator backend: scales the strided
// intermediate result by div = 1/stride, complex-multiplies with exp[l],
// and scatters to the radix-interleaved output layout. A scale() helper is
// generated because vector-by-scalar syntax is not portable across backends.
inline kernel_call bluestein_mul_out(
        const backend::command_queue &queue, size_t batch, size_t p,
        size_t radix, size_t threads, size_t stride,
        const backend::device_vector<T2> &data,
        const backend::device_vector<T2> &exp,
        const backend::device_vector<T2> &out
        )
{
    backend::source_generator o;
    kernel_common<T>(o, queue);
    mul_code<T2>(o, false);

    // scale(x, a) = {x.x * a, x.y * a}: component-wise scalar multiply.
    o.function<T2>("scale").open("(")
        .template parameter<T2>("x")
        .template parameter<T >("a")
    .close(")").open("{");

    o.new_line() << type_name<T2>() << " r = {x.x * a, x.y * a};";
    o.new_line() << "return r;";
    o.close("}");

    o.kernel("bluestein_mul_out").open("(")
        .template parameter< global_ptr<const T2> >("data")
        .template parameter< global_ptr<const T2> >("exp")
        .template parameter< global_ptr<      T2> >("output")
        .template parameter< T                    >("div")
        .template parameter< cl_uint              >("p")
        .template parameter< cl_uint              >("in_stride")
        .template parameter< cl_uint              >("radix")
    .close(")").open("{");

    o.new_line() << "const size_t i = " << o.global_id(0) << ";";
    o.new_line() << "const size_t threads = " << o.global_size(0) << ";";
    o.new_line() << "const size_t b = " << o.global_id(1) << ";";
    o.new_line() << "const size_t l = " << o.global_id(2) << ";";

    // guard against the padded third dimension
    o.new_line() << "if(l < radix)";
    o.open("{");

    // j maps thread i to its radix-interleaved output position
    o.new_line() << "const size_t k = i % p;";
    o.new_line() << "const size_t j = k + (i - k) * radix;";
    o.new_line() << "const size_t in_off = i * in_stride + b * in_stride * threads + l;";
    o.new_line() << "const size_t out_off = j + b * threads * radix + l * p;";

    o.new_line() << "output[out_off] = mul(scale(data[in_off], div), exp[l]);";

    o.close("}");
    o.close("}");

    backend::kernel kernel(queue, o.str(), "bluestein_mul_out");
    kernel.push_arg(data);
    kernel.push_arg(exp);
    kernel.push_arg(out);
    kernel.push_arg(static_cast<T>(1.0 / stride)); // div = 1/stride
    kernel.push_arg(static_cast<cl_uint>(p));
    kernel.push_arg(static_cast<cl_uint>(stride));
    kernel.push_arg(static_cast<cl_uint>(radix));

    // radix_pad = number of work-groups of size wg covering radix items
    // (despite the name, this is a group count, not a padded size).
    const size_t wg = kernel.preferred_work_group_size_multiple(queue);
    const size_t radix_pad = (radix + wg - 1) / wg;

    kernel.config(
            backend::ndrange(threads, batch, radix_pad),
            backend::ndrange(      1,     1,        wg)
            );

    std::ostringstream desc;
    desc << "bluestein_mul_out{r=" << radix << "(" << radix_pad << "), wg=" << wg << ", batch=" << batch << ", p=" << p << ", thr=" << threads << ", stride=" << stride << "}";
    return kernel_call(false, desc.str(), kernel);
}
Пример #14
0
// Bluestein pre-multiply, source-generator backend: scatters each
// radix-sized input segment into a zero-padded, out_stride-wide row,
// multiplying the first `radix` elements by the chirp table `exp` and,
// for p != 1, by an additional twiddle factor whose 2*pi constant is
// baked into the source.
inline kernel_call bluestein_mul_in(
        const backend::command_queue &queue, bool inverse, size_t batch,
        size_t radix, size_t p, size_t threads, size_t stride,
        const backend::device_vector<T2> &data,
        const backend::device_vector<T2> &exp,
        const backend::device_vector<T2> &out
        )
{
    backend::source_generator o;
    kernel_common<T>(o, queue);
    mul_code<T2>(o, false);
    twiddle_code<T, T2>(o);

    o.kernel("bluestein_mul_in").open("(")
        .template parameter< global_ptr<const T2> >("data")
        .template parameter< global_ptr<const T2> >("exp")
        .template parameter< global_ptr<      T2> >("output")
        .template parameter< cl_uint              >("radix")
        .template parameter< cl_uint              >("p")
        .template parameter< cl_uint              >("out_stride")
    .close(")").open("{");

    o.new_line() << "const size_t thread  = " << o.global_id(0)   << ";";
    o.new_line() << "const size_t threads = " << o.global_size(0) << ";";
    o.new_line() << "const size_t batch   = " << o.global_id(1)   << ";";
    o.new_line() << "const size_t element = " << o.global_id(2)   << ";";

    // guard against the padded third dimension
    o.new_line() << "if(element < out_stride)";
    o.open("{");

    o.new_line() << "const size_t in_off  = thread + batch * radix * threads + element * threads;";
    o.new_line() << "const size_t out_off = thread * out_stride + batch * out_stride * threads + element;";

    // first `radix` elements get the chirp/twiddle product...
    o.new_line() << "if(element < radix)";
    o.open("{");

    o.new_line() << type_name<T2>() << " w = exp[element];";

    // extra per-thread twiddle for intermediate passes
    o.new_line() << "if(p != 1)";
    o.open("{");

    o.new_line() << "ulong a = (ulong)element * (thread % p);";
    o.new_line() << "ulong b = (ulong)radix * p;";
    // +/- 2*pi is embedded at 16 significant digits
    o.new_line() << type_name<T2>() << " t = twiddle(" << std::setprecision(16)
        << (inverse ? 1 : -1) * boost::math::constants::two_pi<T>()
        << " * (a % (2 * b)) / b);";
    o.new_line() << "w = mul(w, t);";
    o.close("}");

    o.new_line() << "output[out_off] = mul(data[in_off], w);";

    o.close("}");
    // ...the rest of the padded row is zero-filled
    o.new_line() << "else";
    o.open("{");

    o.new_line() << type_name<T2>() << " r = {0,0};";
    o.new_line() << "output[out_off] = r;";

    o.close("}");
    o.close("}");
    o.close("}");

    backend::kernel kernel(queue, o.str(), "bluestein_mul_in");
    kernel.push_arg(data);
    kernel.push_arg(exp);
    kernel.push_arg(out);
    kernel.push_arg(static_cast<cl_uint>(radix));
    kernel.push_arg(static_cast<cl_uint>(p));
    kernel.push_arg(static_cast<cl_uint>(stride)); // the kernel's out_stride

    // stride_pad = number of work-groups of size wg covering stride items.
    const size_t wg = kernel.preferred_work_group_size_multiple(queue);
    const size_t stride_pad = (stride + wg - 1) / wg;

    kernel.config(
            backend::ndrange(threads, batch, stride_pad),
            backend::ndrange(      1,     1,         wg)
            );

    std::ostringstream desc;
    desc << "bluestein_mul_in{batch=" << batch << ", radix=" << radix << ", p=" << p << ", threads=" << threads << ", stride=" << stride << "(" << stride_pad << "), wg=" << wg << "}";
    return kernel_call(false, desc.str(), kernel);
}
Пример #15
0
// Tiled matrix transpose, source-generator backend (pattern from the NVIDIA
// SDK): a block_size x block_size tile is staged in shared/local memory so
// that both the global read and the global write stay contiguous. Input is
// a width x height matrix of T2; output receives the transpose.
inline kernel_call transpose_kernel(
        const backend::command_queue &queue, size_t width, size_t height,
        const backend::device_vector<T2> &in,
        const backend::device_vector<T2> &out
        )
{
    backend::source_generator o;
    kernel_common<T>(o, queue);

    // determine max block size to fit into local memory/workgroup
    size_t block_size = 128;
    {
#ifndef VEXCL_BACKEND_CUDA
        // query local-memory and work-group limits through the raw OpenCL API
        cl_device_id dev = backend::get_device_id(queue);
        cl_ulong local_size;
        size_t workgroup;
        clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_size, NULL);
        clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &workgroup, NULL);
#else
        const auto local_size = queue.device().max_shared_memory_per_block();
        const auto workgroup = queue.device().max_threads_per_block();
#endif
        // each tile element is a T2 (two reals of type T) — hence
        // sizeof(T) * 2 bytes; halve until the tile and the work-group fit
        while(block_size * block_size * sizeof(T) * 2 > local_size) block_size /= 2;
        while(block_size * block_size > workgroup) block_size /= 2;
    }

    // from NVIDIA SDK.
    o.kernel("transpose").open("(")
        .template parameter< global_ptr<const T2> >("input")
        .template parameter< global_ptr<      T2> >("output")
        .template parameter< cl_uint              >("width")
        .template parameter< cl_uint              >("height")
    .close(")").open("{");

    o.new_line() << "const size_t global_x = " << o.global_id(0) << ";";
    o.new_line() << "const size_t global_y = " << o.global_id(1) << ";";
    o.new_line() << "const size_t local_x  = " << o.local_id(0)  << ";";
    o.new_line() << "const size_t local_y  = " << o.local_id(1)  << ";";
    o.new_line() << "const size_t group_x  = " << o.group_id(0)  << ";";
    o.new_line() << "const size_t group_y  = " << o.group_id(1)  << ";";
    // swapped group indices place the tile at its transposed position
    o.new_line() << "const size_t target_x = local_y + group_y * " << block_size << ";";
    o.new_line() << "const size_t target_y = local_x + group_x * " << block_size << ";";
    o.new_line() << "const bool range = global_x < width && global_y < height;";

    // local memory
    {
        std::ostringstream s;
        s << "block[" << block_size * block_size << "]";
        o.smem_static_var(type_name<T2>(), s.str());
    }

    // copy from input to local memory
    o.new_line() << "if(range) "
        << "block[local_x + local_y * " << block_size << "] = input[global_x + global_y * width];";

    // wait until the whole block is filled
    o.new_line().barrier();

    // transpose local block to target
    o.new_line() << "if(range) "
      << "output[target_x + target_y * height] = block[local_x + local_y * " << block_size << "];";

    o.close("}");

    backend::kernel kernel(queue, o.str(), "transpose");

    kernel.push_arg(in);
    kernel.push_arg(out);
    kernel.push_arg(static_cast<cl_uint>(width));
    kernel.push_arg(static_cast<cl_uint>(height));

    // range multiple of wg size, last block maybe not completely filled.
    // r_w/r_h are the number of block_size-wide tiles in each dimension.
    size_t r_w = (width  + block_size - 1) / block_size;
    size_t r_h = (height + block_size - 1) / block_size;

    kernel.config(backend::ndrange(r_w, r_h), backend::ndrange(block_size, block_size));

    std::ostringstream desc;
    desc << "transpose{"
         << "w=" << width << "(" << r_w << "), "
         << "h=" << height << "(" << r_h << "), "
         << "bs=" << block_size << "}";

    return kernel_call(false, desc.str(), kernel);
}