Example #1
File: kernels.hpp Project: d-meiser/vexcl
inline kernel_call bluestein_pad_kernel(const cl::CommandQueue &queue, size_t n, size_t m, const cl::Buffer &in, const cl::Buffer &out) {
    std::ostringstream o;
    kernel_common<T>(o, qdev(queue));

    o << "real2_t conj(real2_t v) {\n"
      << "  return (real2_t)(v.x, -v.y);\n"
      << "}\n";
    o << "__kernel void bluestein_pad_kernel("
      << "__global real2_t *input, __global real2_t *output, uint n, uint m) {\n"
      << "  const size_t x = get_global_id(0);\n"
      << "  if(x < n || m - x < n)\n"
      << "    output[x] = conj(input[min(x, m - x)]);\n"
      << "  else\n"
      << "    output[x] = (real2_t)(0,0);\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "bluestein_pad_kernel");
    kernel.setArg(0, in);
    kernel.setArg(1, out);
    kernel.setArg(2, static_cast<cl_uint>(n));
    kernel.setArg(3, static_cast<cl_uint>(m));

    std::ostringstream desc;
    desc << "bluestein_pad_kernel{n=" << n << ", m=" << m << "}";
    return kernel_call(true, desc.str(), program, kernel, cl::NDRange(m), cl::NullRange);
}
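The padding logic is easier to follow on the host. Below is a hypothetical scalar reference (the name bluestein_pad_ref and the std::complex<float> element type are illustrative, not taken from the project): the conjugated length-n input is mirrored into a length-m buffer so that the result is symmetric about index m/2, with the gap zero-filled.

#include <algorithm>
#include <complex>
#include <cstddef>
#include <vector>

// Illustrative host-side reference for bluestein_pad_kernel: mirror the
// conjugated input into a zero-padded length-m buffer so that
// out[x] == out[m - x] for 0 < x < m.
std::vector<std::complex<float>> bluestein_pad_ref(
        const std::vector<std::complex<float>> &in, size_t n, size_t m) {
    std::vector<std::complex<float>> out(m, {0.0f, 0.0f});
    for (size_t x = 0; x < m; ++x)
        if (x < n || m - x < n)
            out[x] = std::conj(in[std::min(x, m - x)]);
    return out;
}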
Example #2
File: kernels.hpp Project: d-meiser/vexcl
inline kernel_call bluestein_mul(const cl::CommandQueue &queue, size_t n, size_t batch, const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out) {
    std::ostringstream o;
    kernel_common<T>(o, qdev(queue));
    mul_code(o, false);

    o << "__kernel void bluestein_mul("
      << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, uint stride) {\n"
      << "  const size_t x = get_global_id(0), y = get_global_id(1);\n"
      << "  if(x < stride) {\n"
      << "    const size_t off = x + stride * y;"
      << "    output[off] = mul(data[off], exp[x]);\n"
      << "  }\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "bluestein_mul");
    kernel.setArg(0, data);
    kernel.setArg(1, exp);
    kernel.setArg(2, out);
    kernel.setArg(3, static_cast<cl_uint>(n));

    const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue));
    const size_t threads = alignup(n, wg);

    std::ostringstream desc;
    desc << "bluestein_mul{n=" << n << "(" << threads << "), wg=" << wg << ", batch=" << batch << "}";
    return kernel_call(false, desc.str(), program, kernel, cl::NDRange(threads, batch), cl::NDRange(wg, 1));
}
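This kernel, like the ones below, rounds the logical work size up to a multiple of the preferred work-group size and masks the overhang inside the kernel (the if(x < stride) guard). The alignup helper is not shown in this excerpt; a plausible definition, stated here as an assumption:

#include <cstddef>

// Assumed semantics of the alignup helper: round n up to the next
// multiple of m, e.g. alignup(1000, 32) == 1024.
inline size_t alignup(size_t n, size_t m) {
    return (n + m - 1) / m * m;
}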
Example #3
File: kernels.hpp Project: d-meiser/vexcl
inline kernel_call radix_kernel(bool once, const cl::CommandQueue &queue, size_t n, size_t batch, bool invert, pow radix, size_t p, const cl::Buffer &in, const cl::Buffer &out) {
    std::ostringstream o;
    o << std::setprecision(25);
    const auto device = qdev(queue);
    kernel_common<T>(o, device);
    mul_code(o, invert);
    twiddle_code<T>(o);

    const size_t m = n / radix.value;
    kernel_radix<T>(o, radix, invert);

    auto program = build_sources(qctx(queue), o.str(), "-cl-mad-enable -cl-fast-relaxed-math");
    cl::Kernel kernel(program, "radix");
    kernel.setArg(0, in);
    kernel.setArg(1, out);
    kernel.setArg(2, static_cast<cl_uint>(p));
    kernel.setArg(3, static_cast<cl_uint>(m));

    const size_t wg_mul = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(device);
    //const size_t max_cu = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
    //const size_t max_wg = device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
    size_t wg = wg_mul;
    //while(wg * max_cu < max_wg) wg += wg_mul;
    //wg -= wg_mul;
    const size_t threads = alignup(m, wg);

    std::ostringstream desc;
    desc << "dft{r=" << radix << ", p=" << p << ", n=" << n << ", batch=" << batch << ", threads=" << m << "(" << threads << "), wg=" << wg << "}";

    return kernel_call(once, desc.str(), program, kernel, cl::NDRange(threads, batch), cl::NDRange(wg, 1));
}
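The launch shape follows from the decomposition: each pass runs m = n / radix butterflies per batch, while p tracks the size of the sub-transforms combined so far. The standalone snippet below only illustrates how p and m would evolve if a plan chained radix_kernel calls; the driver loop is an assumption, not code from the project.

#include <cstdio>
#include <cstddef>

// Illustrative only: the pass structure implied by the p and m = n/radix
// arguments above, for n = 64 decomposed into radix-4 passes.
int main() {
    const size_t n = 64, radix = 4;
    for (size_t p = 1; p < n; p *= radix)
        std::printf("pass: p=%zu, m=%zu butterflies per batch\n", p, n / radix);
    // prints passes with p = 1, 4, 16: three radix-4 passes
    // for a 64-point transform
}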
Example #4
File: kernels.hpp Project: d-meiser/vexcl
inline kernel_call transpose_kernel(const cl::CommandQueue &queue, size_t width, size_t height, const cl::Buffer &in, const cl::Buffer &out) {
    std::ostringstream o;
    const auto dev = qdev(queue);
    kernel_common<T>(o, dev);

    // determine max block size to fit into local memory/workgroup
    size_t block_size = 128;
    {
        const auto local_size = dev.getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
        const auto workgroup = dev.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
        while(block_size * block_size * sizeof(T) * 2 > local_size) block_size /= 2;
        while(block_size * block_size > workgroup) block_size /= 2;
    }

    // from NVIDIA SDK.
    o << "__kernel void transpose("
      << "__global const real2_t *input, __global real2_t *output, uint width, uint height) {\n"
      << "  const size_t "
      << "    global_x = get_global_id(0), global_y = get_global_id(1),\n"
      << "    local_x = get_local_id(0), local_y = get_local_id(1),\n"
      << "    group_x = get_group_id(0), group_y = get_group_id(1),\n"
      << "    block_size = " << block_size << ",\n"
      << "    target_x = local_y + group_y * block_size,\n"
      << "    target_y = local_x + group_x * block_size;\n"
      << "  const bool range = global_x < width && global_y < height;\n"
        // local memory
      << "  __local real2_t block[" << (block_size * block_size) << "];\n"
        // copy from input to local memory
      << "  if(range)\n"
      << "    block[local_x + local_y * block_size] = input[global_x + global_y * width];\n"
        // wait until the whole block is filled
      << "  barrier(CLK_LOCAL_MEM_FENCE);\n"
        // transpose local block to target
      << "  if(range)\n"
      << "    output[target_x + target_y * height] = block[local_x + local_y * block_size];\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "transpose");
    kernel.setArg(0, in);
    kernel.setArg(1, out);
    kernel.setArg(2, static_cast<cl_uint>(width));
    kernel.setArg(3, static_cast<cl_uint>(height));

    // round the range up to a multiple of the block size; the last block
    // may be only partially filled.
    size_t r_w = alignup(width, block_size);
    size_t r_h = alignup(height, block_size);

    std::ostringstream desc;
    desc << "transpose{"
         << "w=" << width << "(" << r_w << "), "
         << "h=" << height << "(" << r_h << "), "
         << "bs=" << block_size << "}";

    return kernel_call(false, desc.str(), program, kernel, cl::NDRange(r_w, r_h),
        cl::NDRange(block_size, block_size));
}
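Since the local size equals block_size in both dimensions, target_x reduces to global_y and target_y to global_x, so the kernel performs a plain out-of-place transpose. A hypothetical host-side reference (function name and std::complex element type are illustrative):

#include <complex>
#include <cstddef>
#include <vector>

// Illustrative reference for the transpose kernel: the input is
// height rows by width columns (row-major); the output is width rows
// by height columns with out[x][y] == in[y][x].
std::vector<std::complex<float>> transpose_ref(
        const std::vector<std::complex<float>> &in,
        size_t width, size_t height) {
    std::vector<std::complex<float>> out(width * height);
    for (size_t y = 0; y < height; ++y)
        for (size_t x = 0; x < width; ++x)
            out[x * height + y] = in[y * width + x];
    return out;
}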
Example #5
File: kernels.hpp Project: d-meiser/vexcl
inline kernel_call bluestein_mul_in(const cl::CommandQueue &queue, bool inverse, size_t batch, size_t radix, size_t p, size_t threads, size_t stride, const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out) {
    std::ostringstream o;
    kernel_common<T>(o, qdev(queue));
    mul_code(o, false);
    twiddle_code<T>(o);

    o << "__kernel void bluestein_mul_in("
      << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, "
      << "uint radix, uint p, uint out_stride) {\n"
      << "  const size_t\n"
      << "    thread = get_global_id(0), threads = get_global_size(0),\n"
      << "    batch = get_global_id(1),\n"
      << "    element = get_global_id(2);\n"
      << "  if(element < out_stride) {\n"
      << "    const size_t\n"
      << "      in_off = thread + batch * radix * threads + element * threads,\n"
      << "      out_off = thread * out_stride + batch * out_stride * threads + element;\n"
      << "    if(element < radix) {\n"
      << "      real2_t w = exp[element];"
      << "      if(p != 1) {\n"
      << "        const int sign = " << (inverse ? "+1" : "-1") << ";\n"
      << "        ulong a = (ulong)element * (thread % p);\n"
      << "        ulong b = (ulong)radix * p;\n"
      << "        real2_t t = twiddle(2 * sign * M_PI * (a % (2 * b)) / b);\n"
      << "        w = mul(w, t);\n"
      << "      }\n"
      << "      output[out_off] = mul(data[in_off], w);\n"
      << "    } else\n"
      << "      output[out_off] = (real2_t)(0,0);"
      << "  }\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "bluestein_mul_in");
    kernel.setArg(0, data);
    kernel.setArg(1, exp);
    kernel.setArg(2, out);
    kernel.setArg(3, static_cast<cl_uint>(radix));
    kernel.setArg(4, static_cast<cl_uint>(p));
    kernel.setArg(5, static_cast<cl_uint>(stride));

    const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue));
    const size_t stride_pad = alignup(stride, wg);

    std::ostringstream desc;
    desc << "bluestein_mul_in{batch=" << batch << ", radix=" << radix << ", p=" << p << ", threads=" << threads << ", stride=" << stride << "(" << stride_pad << "), wg=" << wg << "}";
    return kernel_call(false, desc.str(), program, kernel, cl::NDRange(threads, batch, stride_pad), cl::NDRange(1, 1, wg));
}
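The weight applied to each element combines the precomputed Bluestein factor exp[element] with a Cooley-Tukey twiddle that depends on the thread's position within the current size radix * p sub-transform. A hypothetical scalar equivalent, assuming twiddle(phi) evaluates to (cos phi, sin phi) as its use in bluestein_twiddle below suggests:

#include <cmath>
#include <complex>
#include <cstddef>

// Illustrative reference for the per-element weight in bluestein_mul_in.
// The modular reduction a % (2 * b) keeps the angle within one period
// before the conversion to floating point.
std::complex<double> bluestein_in_weight(
        std::complex<double> e,              // exp[element]
        size_t element, size_t thread,
        size_t radix, size_t p, bool inverse) {
    if (p == 1) return e;
    const double sign = inverse ? +1.0 : -1.0;
    const unsigned long long a = (unsigned long long)element * (thread % p);
    const unsigned long long b = (unsigned long long)radix * p;
    const double phi = 2.0 * sign * M_PI * double(a % (2 * b)) / double(b);
    return e * std::complex<double>(std::cos(phi), std::sin(phi));
}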
Example #6
File: kernels.hpp Project: d-meiser/vexcl
inline kernel_call bluestein_twiddle(const cl::CommandQueue &queue, size_t n, bool inverse, const cl::Buffer &out) {
    std::ostringstream o;
    kernel_common<T>(o, qdev(queue));
    twiddle_code<T>(o);

    o << "__kernel void bluestein_twiddle(__global real2_t *output) {\n"
      << "  const size_t x = get_global_id(0), n = get_global_size(0);\n"
      << "  const int sign = " << (inverse ? "+1" : "-1") << ";\n"
      << "  const size_t xx = ((ulong)x * x) % (2 * n);\n"
      << "  output[x] = twiddle(sign * M_PI * xx / n);\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "bluestein_twiddle");
    kernel.setArg(0, out);

    std::ostringstream desc;
    desc << "bluestein_twiddle{n=" << n << ", inverse=" << inverse << "}";
    return kernel_call(true, desc.str(), program, kernel, cl::NDRange(n), cl::NullRange);
}
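Reducing x * x modulo 2n before forming the angle keeps the argument to twiddle within one period and avoids the precision loss of evaluating pi * x * x / n directly for large x; the ulong cast guards the multiplication itself against overflow. A hypothetical host-side equivalent:

#include <cmath>
#include <complex>
#include <cstddef>
#include <vector>

// Illustrative reference for bluestein_twiddle:
// w[x] = exp(sign * i * pi * x^2 / n), with x^2 reduced mod 2n first.
std::vector<std::complex<double>> bluestein_twiddle_ref(size_t n, bool inverse) {
    const double sign = inverse ? +1.0 : -1.0;
    std::vector<std::complex<double>> w(n);
    for (size_t x = 0; x < n; ++x) {
        const unsigned long long xx = (unsigned long long)x * x % (2 * n);
        const double phi = sign * M_PI * double(xx) / double(n);
        w[x] = {std::cos(phi), std::sin(phi)};
    }
    return w;
}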
Example #7
File: kernels.hpp Project: d-meiser/vexcl
inline kernel_call bluestein_mul_out(const cl::CommandQueue &queue, size_t batch, size_t p, size_t radix, size_t threads, size_t stride, const cl::Buffer &data, const cl::Buffer &exp, const cl::Buffer &out) {
    std::ostringstream o;
    kernel_common<T>(o, qdev(queue));
    mul_code(o, false);

    o << "__kernel void bluestein_mul_out("
      << "__global const real2_t *data, __global const real2_t *exp, __global real2_t *output, "
      << "real_t div, uint p, uint in_stride, uint radix) {\n"
      << "  const size_t\n"
      << "    i = get_global_id(0), threads = get_global_size(0),\n"
      << "    b = get_global_id(1),\n"
      << "    l = get_global_id(2);\n"
      << "  if(l < radix) {\n"
      << "    const size_t\n"
      << "      k = i % p,\n"
      << "      j = k + (i - k) * radix,\n"
      << "      in_off = i * in_stride + b * in_stride * threads + l,\n"
      << "      out_off = j + b * threads * radix + l * p;\n"
      << "    output[out_off] = mul(data[in_off] * div, exp[l]);\n"
      << "  }\n"
      << "}\n";

    auto program = build_sources(qctx(queue), o.str());
    cl::Kernel kernel(program, "bluestein_mul_out");
    kernel.setArg(0, data);
    kernel.setArg(1, exp);
    kernel.setArg(2, out);
    kernel.setArg<T>(3, static_cast<T>(1) / stride);
    kernel.setArg(4, static_cast<cl_uint>(p));
    kernel.setArg(5, static_cast<cl_uint>(stride));
    kernel.setArg(6, static_cast<cl_uint>(radix));

    const size_t wg = kernel.getWorkGroupInfo<CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE>(qdev(queue));
    const size_t radix_pad = alignup(radix, wg);

    std::ostringstream desc;
    desc << "bluestein_mul_out{r=" << radix << "(" << radix_pad << "), wg=" << wg << ", batch=" << batch << ", p=" << p << ", thr=" << threads << ", stride=" << stride << "}";
    return kernel_call(false, desc.str(), program, kernel, cl::NDRange(threads, batch, radix_pad), cl::NDRange(1, 1, wg));
}
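The output indexing interleaves the radix partial results back into natural order: k = i mod p is the position inside the current size-p sub-transform, and the radix outputs for l = 0 .. radix-1 land p elements apart starting at j. A small illustrative helper (assumed, per batch; the batch offset b * threads * radix is added separately in the kernel):

#include <cstddef>

// Illustrative per-batch output index from bluestein_mul_out.
inline size_t mul_out_index(size_t i, size_t l, size_t p, size_t radix) {
    const size_t k = i % p;
    const size_t j = k + (i - k) * radix;
    return j + l * p;
}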
Example #8
        typename std::enable_if<
            !boost::proto::matches<
                typename boost::proto::result_of::as_expr<ExprTuple>::type,
                multivector_full_grammar
            >::value
#if !defined(_MSC_VER) || _MSC_VER >= 1700
            && N == std::tuple_size<ExprTuple>::value
#endif
            , const multivector&
        >::type
        operator=(const ExprTuple &expr) {
#endif
            static kernel_cache cache;

            const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

            for(uint d = 0; d < queue.size(); d++) {
                cl::Context context = qctx(queue[d]);
                cl::Device  device  = qdev(queue[d]);

                auto kernel = cache.find( context() );

                if (kernel == cache.end()) {
                    std::ostringstream source;

                    source << standard_kernel_header(device);

                    {
                        get_header f(source);
                        for_each<0>(expr, f);
                    }

                    source <<
                        "kernel void multi_expr_tuple(\n"
                        "\t" << type_name<size_t>() << " n";

                    for(uint i = 1; i <= N; i++)
                        source << ",\n\tglobal " << type_name<T>() << " *res_" << i;

                    {
                        get_params f(source);
                        for_each<0>(expr, f);
                    }

                    source << "\n)\n{\n";

                    if ( is_cpu(device) ) {
                        source <<
                            "\tsize_t chunk_size  = (n + get_global_size(0) - 1) / get_global_size(0);\n"
                            "\tsize_t chunk_start = get_global_id(0) * chunk_size;\n"
                            "\tsize_t chunk_end   = min(n, chunk_start + chunk_size);\n"
                            "\tfor(size_t idx = chunk_start; idx < chunk_end; ++idx) {\n";
                    } else {
                        source <<
                            "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";
                    }

                    {
                        get_expressions f(source);
                        for_each<0>(expr, f);
                    }

                    source << "\n";

                    for(uint i = 1; i <= N; i++)
                        source << "\t\tres_" << i << "[idx] = buf_" << i << ";\n";

                    source << "\t}\n}\n";

                    auto program = build_sources(context, source.str());

                    cl::Kernel krn(program, "multi_expr_tuple");
                    size_t wgs = kernel_workgroup_size(krn, device);

                    kernel = cache.insert(std::make_pair(
                                context(), kernel_cache_entry(krn, wgs)
                                )).first;
                }

                if (size_t psize = vec[0]->part_size(d)) {
                    size_t w_size = kernel->second.wgsize;
                    size_t g_size = num_workgroups(device) * w_size;

                    uint pos = 0;
                    kernel->second.kernel.setArg(pos++, psize);

                    for(uint i = 0; i < N; i++)
                        kernel->second.kernel.setArg(pos++, (*vec[i])(d));

                    {
                        set_arguments f(kernel->second.kernel, d, pos, vec[0]->part_start(d));
                        for_each<0>(expr, f);
                    }

                    queue[d].enqueueNDRangeKernel(
                            kernel->second.kernel, cl::NullRange, g_size, w_size
                            );
                }
            }

            return *this;
        }
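For concreteness, assuming T = double, N = 2, a non-CPU device, and that each component expression reads two source vectors, the assembled source would look roughly as follows. The prm_* parameter names and the buf_* expression bodies are hypothetical stand-ins for whatever get_params and get_expressions actually emit:

kernel void multi_expr_tuple(
	ulong n,
	global double *res_1,
	global double *res_2,
	global double *prm_1_1,
	global double *prm_1_2,
	global double *prm_2_1,
	global double *prm_2_2
)
{
	for(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {
		double buf_1 = prm_1_1[idx] + prm_1_2[idx];
		double buf_2 = prm_2_1[idx] - prm_2_2[idx];

		res_1[idx] = buf_1;
		res_2[idx] = buf_2;
	}
}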
Example #9
        typename std::enable_if<
            boost::proto::matches<
                typename boost::proto::result_of::as_expr<Expr>::type,
                multivector_expr_grammar
            >::value,
            const multivector&
        >::type
        operator=(const Expr& expr) {
            static kernel_cache cache;

            const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

            // If any device in context is CPU, then do not fuse the kernel,
            // but assign components individually.
            if (std::any_of(queue.begin(), queue.end(), [](const cl::CommandQueue &q) { return is_cpu(qdev(q)); })) {
                assign_subexpressions<0, N>(boost::proto::as_child(expr));
                return *this;
            }

            for(uint d = 0; d < queue.size(); d++) {
                cl::Context context = qctx(queue[d]);
                cl::Device  device  = qdev(queue[d]);

                auto kernel = cache.find( context() );

                if (kernel == cache.end()) {
                    std::ostringstream kernel_name;
                    kernel_name << "multi_";
                    vector_name_context name_ctx(kernel_name);
                    boost::proto::eval(boost::proto::as_child(expr), name_ctx);

                    std::ostringstream source;
                    source << standard_kernel_header(device);

                    extract_user_functions()(
                            boost::proto::as_child(expr),
                            declare_user_function(source)
                            );

                    source << "kernel void " << kernel_name.str()
                           << "(\n\t" << type_name<size_t>() << " n";

                    for(size_t i = 0; i < N; )
                        source << ",\n\tglobal " << type_name<T>()
                               << " *res_" << ++i;

                    build_param_list<N>(boost::proto::as_child(expr), source);

                    source <<
                        "\n)\n{\n"
                        "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";

                    build_expr_list(boost::proto::as_child(expr), source);

                    source << "\t}\n}\n";

                    auto program = build_sources(context, source.str());

                    cl::Kernel krn(program, kernel_name.str().c_str());
                    size_t wgs = kernel_workgroup_size(krn, device);

                    kernel = cache.insert(std::make_pair(
                                context(), kernel_cache_entry(krn, wgs)
                                )).first;
                }

                if (size_t psize = vec[0]->part_size(d)) {
                    size_t w_size = kernel->second.wgsize;
                    size_t g_size = num_workgroups(device) * w_size;

                    uint pos = 0;
                    kernel->second.kernel.setArg(pos++, psize);

                    for(uint i = 0; i < N; i++)
                        kernel->second.kernel.setArg(pos++, vec[i]->operator()(d));

                    set_kernel_args<N>(
                            boost::proto::as_child(expr),
                            kernel->second.kernel, d, pos, vec[0]->part_start(d)
                            );

                    queue[d].enqueueNDRangeKernel(
                            kernel->second.kernel, cl::NullRange, g_size, w_size
                            );
                }
            }

            return *this;
        }
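A sketch of how this operator is reached from user code, assuming standard vexcl usage with a two-component multivector (expression syntax as in the project's README):

#include <vexcl/vexcl.hpp>

int main() {
    vex::Context ctx(vex::Filter::DoublePrecision);
    const size_t n = 1 << 20;

    vex::multivector<double, 2> x(ctx, n), y(ctx, n);

    // A single multivector expression: on GPU-only contexts this
    // operator= generates one fused kernel per context and caches it;
    // if any device in the context is a CPU, the components are
    // assigned individually via assign_subexpressions instead.
    x = 2 * y + 1;
}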