static void get(backend::source_generator &src, const ccsr_product<val_t, col_t, idx_t, T>&, const backend::command_queue&, const std::string &prm_name, detail::kernel_generator_state_ptr) { typedef decltype(val_t() * T()) res_t; src.function<res_t>(prm_name + "_spmv") .open("(") .template parameter< global_ptr<const idx_t> >("idx") .template parameter< global_ptr<const idx_t> >("row") .template parameter< global_ptr<const col_t> >("col") .template parameter< global_ptr<const val_t> >("val") .template parameter< global_ptr<const T> >("vec") .template parameter< size_t >("i") .close(")").open("{"); src.new_line() << type_name<res_t>() << " sum = 0;"; src.new_line() << "for(size_t pos = idx[i], j = row[pos], end = row[pos+1]; j < end; ++j)"; src.open("{"); src.new_line() << "sum += val[j] * vec[i + col[j]];"; src.close("}"); src.new_line() << "return sum;"; src.close("}"); }
static void local_terminal_init(const Vector &x, backend::source_generator &src, const backend::command_queue &q, const std::string &prm_name, detail::kernel_generator_state_ptr state) { typedef typename detail::return_type<Vector>::type x_type; typedef spmv_ops_impl<Val, x_type> spmv_ops; spmv_ops::decl_accum_var(src, prm_name + "_sum"); src.open("{"); // ELL part src.new_line() << "for(size_t j = 0; j < " << prm_name << "_ell_width; ++j)"; src.open("{"); src.new_line() << type_name<Col>() << " nnz_idx = idx + j * " << prm_name << "_ell_pitch;"; src.new_line() << type_name<Col>() << " c = " << prm_name << "_ell_col[nnz_idx];"; src.new_line() << "if (c != (" << type_name<Col>() << ")(-1))"; src.open("{"); src.new_line() << type_name<Col>() << " idx = c;"; { detail::output_local_preamble init_x(src, q, prm_name + "_x", state); boost::proto::eval(boost::proto::as_child(x), init_x); backend::source_generator vec_value; detail::vector_expr_context expr_x(vec_value, q, prm_name + "_x", state); boost::proto::eval(boost::proto::as_child(x), expr_x); spmv_ops::append_product(src, prm_name + "_sum", prm_name + "_ell_val[nnz_idx]", vec_value.str()); } src.close("} else break;"); src.close("}"); // CSR part src.new_line() << "if (" << prm_name << "_csr_ptr)"; src.open("{"); src.new_line() << type_name<Ptr>() << " csr_beg = " << prm_name << "_csr_ptr[idx];"; src.new_line() << type_name<Ptr>() << " csr_end = " << prm_name << "_csr_ptr[idx+1];"; src.new_line() << "for(" << type_name<Ptr>() << " j = csr_beg; j < csr_end; ++j)"; src.open("{"); src.new_line() << type_name<Col>() << " idx = " << prm_name << "_csr_col[j];"; { detail::output_local_preamble init_x(src, q, prm_name + "_x", state); boost::proto::eval(boost::proto::as_child(x), init_x); backend::source_generator vec_value; detail::vector_expr_context expr_x(vec_value, q, prm_name + "_x", state); boost::proto::eval(boost::proto::as_child(x), expr_x); spmv_ops::append_product(src, prm_name + "_sum", prm_name + 
"_csr_val[j]", vec_value.str()); } src.close("}"); src.close("}"); src.close("}"); }
static void get(backend::source_generator &src, const temporary<T, Tag, Expr> &term, const backend::command_queue &queue, const std::string &prm_name, detail::kernel_generator_state_ptr state) { auto s = state->find("tmp_locinit"); if (s == state->end()) { s = state->insert(std::make_pair( std::string("tmp_locinit"), boost::any(std::set<size_t>()) )).first; } auto &pos = boost::any_cast< std::set<size_t>& >(s->second); auto p = pos.find(Tag); if (p == pos.end()) { pos.insert(Tag); detail::output_local_preamble init_ctx(src, queue, prm_name, state); boost::proto::eval(boost::proto::as_child(term.expr), init_ctx); src.new_line() << type_name<T>() << " temp_" << Tag << " = "; detail::vector_expr_context expr_ctx(src, queue, prm_name, state); boost::proto::eval(boost::proto::as_child(term.expr), expr_ctx); src << ";"; } }
/// Emits the device function `twiddle(alpha)`, which returns a T2 initialised
/// with the cosine and sine of alpha (in that order).
///
/// For T == cl_double the generated code calls sincos; otherwise it uses
/// native_cos/native_sin, or __sincosf when VEXCL_BACKEND_CUDA is defined.
inline void twiddle_code(backend::source_generator &o) {
    const bool double_precision = std::is_same<T, cl_double>::value;

    o.function<T2>("twiddle").open("(")
        .template parameter<T>("alpha")
        .close(")").open("{");

    if (double_precision) {
        // use sincos with double since we probably want higher precision
#ifdef VEXCL_BACKEND_CUDA
        o.new_line() << type_name<T>() << " sn, cs;";
        o.new_line() << "sincos(alpha, &sn, &cs);";
#else
        o.new_line() << type_name<T>() << " cs, sn = sincos(alpha, &cs);";
#endif
        o.new_line() << type_name<T2>() << " r = {cs, sn};";
    } else {
        // use native with float since we probably want higher performance
#ifdef VEXCL_BACKEND_CUDA
        o.new_line() << type_name<T>() << " sn, cs;";
        o.new_line() << "__sincosf(alpha, &sn, &cs);";
        o.new_line() << type_name<T2>() << " r = {cs, sn};";
#else
        o.new_line() << type_name<T2>() << " r = {"
            "native_cos(alpha), native_sin(alpha)};";
#endif
    }

    o.new_line() << "return r;";
    o.close("}");
}
/// Emits the device function `mul(a, b)` returning the product of two T2
/// values treated as complex numbers (x = real part, y = imaginary part).
///
/// @param o      Generator receiving the emitted source.
/// @param invert When true, the generated code multiplies a by the conjugate
///               of b instead of b.
inline void mul_code(backend::source_generator &o, bool invert) {
    // Real and imaginary components of the result, closing the initializer.
    const char *components = invert
        ? "a.x * b.x + a.y * b.y, " "a.y * b.x - a.x * b.y};"   // conjugate b
        : "a.x * b.x - a.y * b.y, " "a.y * b.x + a.x * b.y};";

    o.function<T2>("mul").open("(")
        .template parameter<T2>("a")
        .template parameter<T2>("b")
        .close(")").open("{");

    o.new_line() << type_name<T2>() << " r = {" << components;
    o.new_line() << "return r;";

    o.close("}");
}
/// Emits the device function `mul(a, b)` returning the product of two T2
/// values treated as complex numbers (x = real part, y = imaginary part).
///
/// @param o      Generator receiving the emitted source.
/// @param invert When true, the generated code multiplies a by the conjugate
///               of b instead of b.
inline void mul_code(backend::source_generator &o, bool invert) {
    // Real and imaginary components of the result, closing the initializer.
    const char *components = invert
        ? "a.x * b.x + a.y * b.y, " "a.y * b.x - a.x * b.y};"   // conjugate b
        : "a.x * b.x - a.y * b.y, " "a.y * b.x + a.x * b.y};";

    o.begin_function<T2>("mul");
    o.begin_function_parameters();
    o.template parameter<T2>("a");
    o.template parameter<T2>("b");
    o.end_function_parameters();

    o.new_line() << type_name<T2>() << " r = {" << components;
    o.new_line() << "return r;";

    o.end_function();
}
/// Emits the `radix` kernel together with the in-place DFT helper it calls.
///
/// The generated kernel reads radix.value inputs per work item from x,
/// multiplies all but the first by twiddle factors when p != 1, calls
/// dft<radix.value> on them and stores the results to y.
///
/// @param o      Generator receiving the emitted source.
/// @param radix  Radix of the step; radix.value inputs are handled per item.
/// @param invert Forwarded to in_place_dft.
inline void kernel_radix(backend::source_generator &o, pow radix, bool invert) {
    const auto width = radix.value;

    o << in_place_dft(width, invert);

    // kernel.
    o.begin_kernel("radix");
    o.begin_kernel_parameters();
    o.template parameter< global_ptr<const T2> >("x");
    o.template parameter< global_ptr<      T2> >("y");
    o.template parameter< cl_uint              >("p");
    o.template parameter< cl_uint              >("threads");
    o.end_kernel_parameters();

    o.new_line() << "const size_t i = " << o.global_id(0) << ";";
    o.new_line() << "if(i >= threads) return;";

    // index in input sequence, in 0..P-1
    o.new_line() << "const size_t k = i % p;";
    o.new_line() << "const size_t batch_offset = " << o.global_id(1)
        << " * threads * " << width << ";";

    // read
    o.new_line() << "x += i + batch_offset;";
    for (size_t n = 0; n < width; ++n) {
        o.new_line() << type_name<T2>() << " v" << n
            << " = x[" << n << " * threads];";
    }

    // twiddle
    o.new_line() << "if(p != 1)";
    o.open("{");
    for (size_t n = 1; n < width; ++n) {
        const T alpha = -boost::math::constants::two_pi<T>() * n / width;

        o.new_line() << "v" << n << " = mul(v" << n << ", twiddle("
            << "(" << type_name<T>() << ")"
            << std::setprecision(16) << alpha << " * k / p));";
    }
    o.close("}");

    // inplace DFT
    o.new_line() << "dft" << width;
    param_list(o, "&", 0, width);
    o << ";";

    // write back
    o.new_line() << "const size_t j = k + (i - k) * " << width << ";";
    o.new_line() << "y += j + batch_offset;";
    for (size_t n = 0; n < width; ++n) {
        o.new_line() << "y[" << n << " * p] = v" << n << ";";
    }

    o.end_kernel();
}
static void local_terminal_init(const Vector &x, backend::source_generator &src, const backend::command_queue &q, const std::string &prm_name, detail::kernel_generator_state_ptr state) { typedef typename detail::return_type<Vector>::type x_type; typedef decltype(std::declval<Val>() * std::declval<x_type>()) res_type; src.new_line() << type_name<res_type>() << " " << prm_name << "_sum = " << res_type() << ";"; src.new_line() << "if (" << prm_name << "_ptr)"; src.open("{"); src.new_line() << type_name<Ptr>() << " row_beg = " << prm_name << "_ptr[idx];"; src.new_line() << type_name<Ptr>() << " row_end = " << prm_name << "_ptr[idx+1];"; src.new_line() << "for(" << type_name<Ptr>() << " j = row_beg; j < row_end; ++j)"; src.open("{"); src.new_line() << type_name<Col>() << " idx = " << prm_name << "_col[j];"; detail::output_local_preamble init_x(src, q, prm_name + "_x", state); boost::proto::eval(boost::proto::as_child(x), init_x); src.new_line() << prm_name << "_sum += " << prm_name << "_val[j] * "; detail::vector_expr_context expr_x(src, q, prm_name + "_x", state); boost::proto::eval(boost::proto::as_child(x), expr_x); src << ";"; src.close("}"); src.close("}"); }
inline void twiddle_code(backend::source_generator &o) { o.begin_function<T2>("twiddle"); o.begin_function_parameters(); o.template parameter<T>("alpha"); o.end_function_parameters(); if(std::is_same<T, cl_double>::value) { // use sincos with double since we probably want higher precision #if defined(VEXCL_BACKEND_OPENCL) || defined(VEXCL_BACKEND_COMPUTE) o.new_line() << type_name<T>() << " cs, sn = sincos(alpha, &cs);"; #else o.new_line() << type_name<T>() << " sn, cs;"; o.new_line() << "sincos(alpha, &sn, &cs);"; #endif o.new_line() << type_name<T2>() << " r = {cs, sn};"; } else { // use native with float since we probably want higher performance #if defined(VEXCL_BACKEND_OPENCL) || defined(VEXCL_BACKEND_COMPUTE) o.new_line() << type_name<T2>() << " r = {" "native_cos(alpha), native_sin(alpha)};"; #elif defined(VEXCL_BACKEND_CUDA) o.new_line() << type_name<T>() << " sn, cs;"; o.new_line() << "__sincosf(alpha, &sn, &cs);"; o.new_line() << type_name<T2>() << " r = {cs, sn};"; #elif defined(VEXCL_BACKEND_JIT) o.new_line() << type_name<T>() << " sn, cs;"; o.new_line() << "sincosf(alpha, &sn, &cs);"; o.new_line() << type_name<T2>() << " r = {cs, sn};"; #else # error Unsupported backend! #endif } o.new_line() << "return r;"; o.end_function(); }
/// Emits the device function `fname(prm1, prm2)` that produces a random value
/// of type T from the counter-based generator `Generator`.
///
/// The generated code fills the counter words alternately with prm1 and prm2,
/// sets every key word to 0x12345678, runs the generator on them and, for
/// cl_float / cl_double element types, converts each integer output to a
/// floating point value by dividing it by the maximum of the integer type.
///
/// @param src   Generator receiving the emitted source.
/// @param fname Name of the emitted function.
static void define(backend::source_generator &src, const std::string &fname) {
    using ctr_t = typename std::conditional<
        (sizeof(T) < 32), cl_uint, cl_ulong
        >::type;

    const size_t N     = cl_vector_length<T>::value;
    const size_t ctr_n = sizeof(T) <= 8 ? 2 : 4;

    using generator = typename Generator::template function<ctr_t, ctr_n>;
    const size_t key_n = generator::K;

    const bool single_prec = std::is_same<Ts, cl_float>::value;
    const bool double_prec = std::is_same<Ts, cl_double>::value;

    // The generator's own device code has to precede the function using it.
    generator::define(src);

    src.begin_function<T>(fname);
    src.begin_function_parameters();
    src.template parameter<cl_ulong>("prm1");
    src.template parameter<cl_ulong>("prm2");
    src.end_function_parameters();

    // Union giving integer, floating point and T views of the counter words.
    src.new_line() << "union ";
    src.open("{");
    src.new_line() << type_name<ctr_t>() << " ctr[" << ctr_n << "];";
    if (single_prec) {
        src.new_line() << type_name<cl_uint>()  << " res_i[" << N << "];";
        src.new_line() << type_name<cl_float>() << " res_f[" << N << "];";
    } else if (double_prec) {
        src.new_line() << type_name<cl_ulong>()  << " res_i[" << N << "];";
        src.new_line() << type_name<cl_double>() << " res_f[" << N << "];";
    }
    src.new_line() << type_name<T>() << " res;";
    src.close("} ctr;");

    src.new_line() << type_name<ctr_t>() << " key[" << key_n << "];";

    // Counter words are seeded pairwise from the two parameters.
    for (size_t c = 0; c < ctr_n; c += 2) {
        src.new_line()
            << "ctr.ctr[" << c     << "] = prm1; "
            << "ctr.ctr[" << c + 1 << "] = prm2;";
    }

    for (size_t k = 0; k < key_n; ++k) {
        src.new_line() << "key[" << k << "] = 0x12345678;";
    }

    src.new_line() << generator::name() << "(ctr.ctr, key);";

    // Integer outputs are scaled by the maximum of their integer type.
    if (single_prec) {
        for (size_t n = 0; n < N; ++n) {
            src.new_line()
                << "ctr.res_f[" << n << "] = ctr.res_i[" << n << "] / "
                << std::numeric_limits<cl_uint>::max() << ".0f;";
        }
    } else if (double_prec) {
        for (size_t n = 0; n < N; ++n) {
            src.new_line()
                << "ctr.res_f[" << n << "] = ctr.res_i[" << n << "] / "
                << std::numeric_limits<cl_ulong>::max() << ".0;";
        }
    }

    src.new_line() << "return ctr.res;";
    src.end_function();
}
static void define(backend::source_generator &src, const std::string &fname) { const size_t N = cl_vector_length<T>::value; const bool is_float = std::is_same<Ts, cl_float>::value; const size_t ctr_n = is_float ? 2 : 4; typedef typename Generator::template function<cl_uint, ctr_n> generator; const size_t key_n = generator::K; generator::define(src); src.begin_function<T>(fname); src.begin_function_parameters(); src.template parameter<cl_ulong>("prm1"); src.template parameter<cl_ulong>("prm2"); src.end_function_parameters(); #if defined(VEXCL_BACKEND_JIT) src.new_line() << "#define cospi(x) cos(M_PI * (x))"; #endif src.new_line() << "union "; src.open("{"); src.new_line() << type_name<cl_uint>() << " ctr[" << ctr_n << "];"; if (is_float) { src.new_line() << type_name<cl_uint>() << " res_i[2];"; } else { src.new_line() << type_name<cl_ulong>() << " res_i[2];"; } src.close("} ctr;"); src.new_line() << type_name<Ts>() << " u[2];"; src.new_line() << type_name<cl_uint>() << " key[" << key_n << "];"; for(size_t i = 0; i < ctr_n; i += 2) src.new_line() << "ctr.ctr[" << i << "] = prm1; " << "ctr.ctr[" << i + 1 << "] = prm2;"; for(size_t i = 0; i < key_n; ++i) src.new_line() << "key[" << i << "] = 0x12345678;"; if (N > 1) { src.new_line() << "union "; src.open("{"); src.new_line() << type_name<Ts>() << " z[" << N << "];"; src.new_line() << type_name<T>() << " v;"; src.close("} res;"); } for(size_t i = 0 ; i < N ; i += 2) { src.new_line() << generator::name() << "(ctr.ctr, key);"; if(is_float) { for(size_t i = 0; i < 2; ++i) src.new_line() << "u[" << i << "] = ctr.res_i[" << i << "] / " << std::numeric_limits<cl_uint>::max() << ".0f;"; } else { for(size_t i = 0; i < 2; ++i) src.new_line() << "u[" << i << "] = ctr.res_i[" << i << "] / " << std::numeric_limits<cl_ulong>::max() << ".0;"; } if(N == 1) { src.new_line() << "return sqrt(-2 * log(u[0])) * cospi(2 * u[1]);\n"; } else { src.open("{"); src.new_line() << type_name<Ts>() << " l = sqrt(-2 * log(u[0])), cs, sn;"; #if 
defined(VEXCL_BACKEND_CUDA) src.new_line() << "sincospi(2 * u[1], &sn, &cs);"; #else src.new_line() << "sn = sincos(" << std::setprecision(16) << boost::math::constants::two_pi<double>() << " * u[1], &cs);"; #endif src.new_line() << "res.z[" << i << "] = l * cs;"; src.new_line() << "res.z[" << i + 1 << "] = l * sn;"; src.close("}"); } } if (N > 1) src.new_line() << "return res.v;"; src.end_function(); }