Example #1
/*  Set num_var_ranges to the number of variable-range MTRRs  */
static void __init set_num_var_ranges(void)
{
	unsigned long config = 0, dummy;

	if (use_intel())
		rdmsr(MSR_MTRRcap, config, dummy);
	else if (is_cpu(AMD))
		config = 2;
	else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
		config = 8;

	num_var_ranges = config & 0xff;
}
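The count comes from the low byte of the capability register: bits 7:0 of IA32_MTRRCAP (the VCNT field) hold the number of variable-range MTRRs, which is what the final config & 0xff extracts. A minimal, self-contained sketch of the same extraction (the raw MSR value here is made up):

#include <cstdint>
#include <cstdio>

int main() {
    /* Hypothetical raw IA32_MTRRCAP value: VCNT = 10, FIX (bit 8) and WC (bit 10) set */
    uint64_t mtrrcap = 0x50a;
    unsigned vcnt = mtrrcap & 0xff;   /* bits 7:0 hold the variable-range count */
    printf("variable MTRRs: %u\n", vcnt);
    return 0;
}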
Example #2
/*  Put the processor into a state where MTRRs can be safely set  */
void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
{
    unsigned int cr0;

    /*  Disable interrupts locally  */
    local_irq_save(ctxt->flags);

    if (use_intel() || is_cpu(CYRIX)) {

        /*  Save value of CR4 and clear Page Global Enable (bit 7)  */
        if ( cpu_has_pge ) {
            ctxt->cr4val = read_cr4();
            write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
        }

        /*  Disable and flush caches. Note that wbinvd flushes the TLBs as
            a side-effect  */
        cr0 = read_cr0() | 0x40000000;    /* set CD (cache-disable, bit 30) */
        wbinvd();
        write_cr0(cr0);
        wbinvd();

        if (use_intel())
            /*  Save MTRR state */
            rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
        else
            /* Cyrix ARRs - everything else was excluded at the top */
            ctxt->ccr3 = getCx86(CX86_CCR3);
    }
}
Example #3
/*  Restore the processor after a set_mtrr_prepare  */
void set_mtrr_done(struct set_mtrr_context *ctxt)
{
    if (use_intel() || is_cpu(CYRIX)) {

        /*  Flush caches and TLBs  */
        wbinvd();

        /*  Restore MTRRdefType  */
        if (use_intel())
            /* Intel (P6) standard MTRRs */
            mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
        else
            /* Cyrix ARRs - everything else was excluded at the top */
            setCx86(CX86_CCR3, ctxt->ccr3);

        /*  Enable caches  */
        write_cr0(read_cr0() & 0xbfffffff);    /* clear CD (bit 30) */

        /*  Restore value of CR4  */
        if ( cpu_has_pge )
            write_cr4(ctxt->cr4val);
    }
    /*  Re-enable interrupts locally (if enabled previously)  */
    local_irq_restore(ctxt->flags);
}
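Examples 2 and 3 are meant to bracket the actual register update. A sketch of the typical call sequence in a vendor-specific set_mtrr handler, based on the same kernel era (the handler name and the elided register writes are placeholders):

static void example_set_mtrr(unsigned int reg, unsigned long base,
                             unsigned long size, mtrr_type type)
{
    struct set_mtrr_context ctxt;

    set_mtrr_prepare_save(&ctxt);   /* irqs off, caches disabled and flushed, state saved */
    /* ... write the vendor-specific MTRR/ARR registers here ... */
    set_mtrr_done(&ctxt);           /* restore MTRR state, caches, CR4 and irq flags */
}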
Example #4
int mtrr_del_page(int reg, unsigned long base, unsigned long size)
{
	int i, max;
	mtrr_type ltype;
	unsigned long lbase;
	unsigned int lsize;
	int error = -EINVAL;

	if (!mtrr_if)
		return -ENXIO;

	max = num_var_ranges;
	/* No CPU hotplug when we change MTRR entries */
	lock_cpu_hotplug();
	mutex_lock(&mtrr_mutex);
	if (reg < 0) {
		/*  Search for existing MTRR  */
		for (i = 0; i < max; ++i) {
			mtrr_if->get(i, &lbase, &lsize, &ltype);
			if (lbase == base && lsize == size) {
				reg = i;
				break;
			}
		}
		if (reg < 0) {
			printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
			       size);
			goto out;
		}
	}
	if (reg >= max) {
		printk(KERN_WARNING "mtrr: register: %d too big\n", reg);
		goto out;
	}
	if (is_cpu(CYRIX) && !use_intel()) {
		if ((reg == 3) && arr3_protected) {
			printk(KERN_WARNING "mtrr: ARR3 cannot be changed\n");
			goto out;
		}
	}
	mtrr_if->get(reg, &lbase, &lsize, &ltype);
	if (lsize < 1) {
		printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg);
		goto out;
	}
	if (usage_table[reg] < 1) {
		printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
		goto out;
	}
	if (--usage_table[reg] < 1)
		set_mtrr(reg, 0, 0, 0);
	error = reg;
 out:
	mutex_unlock(&mtrr_mutex);
	unlock_cpu_hotplug();
	return error;
}
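mtrr_del_page operates in page units (hence the %lx000 in the debug message above). The byte-granularity wrapper of the same kernel era validates 4 KiB alignment and shifts down to pages, roughly as follows (a sketch, assuming the usual mtrr_check helper):

int mtrr_del(int reg, unsigned long base, unsigned long size)
{
    if (mtrr_check(base, size))     /* reject base/size that are not 4 KiB aligned */
        return -EINVAL;
    return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
}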
Example #5
 static void partial_vector_expr(const Vector &x, backend::source_generator &src,
     const backend::command_queue &q, const std::string &prm_name,
     detail::kernel_generator_state_ptr state)
 {
     if (is_cpu(q)) {
         Csr::partial_vector_expr(x, src, q, prm_name, state);
     } else {
         Ell::partial_vector_expr(x, src, q, prm_name, state);
     }
 }
Example #6
void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
{
    if (use_intel())
        /*  Disable MTRRs, and set the default type to uncached: the 0xf300
            mask clears the E/FE enable bits and the default-type byte  */
        mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL,
                   ctxt->deftype_hi);
    else if (is_cpu(CYRIX))
        /* Cyrix ARRs - everything else was excluded at the top */
        setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
}
Example #7
 static void local_terminal_init(const Vector &x, backend::source_generator &src,
     const backend::command_queue &q, const std::string &prm_name,
     detail::kernel_generator_state_ptr state)
 {
     if (is_cpu(q)) {
         Csr::local_terminal_init(x, src, q, prm_name, state);
     } else {
         Ell::local_terminal_init(x, src, q, prm_name, state);
     }
 }
Example #8
int reduce_by_key_sink(
        IKTuple &&ikeys, vector<V> const &ivals,
        OKTuple &&okeys, vector<V>       &ovals,
        Comp, Oper
        )
{
    namespace fusion = boost::fusion;
    typedef typename extract_value_types<IKTuple>::type K;

    static_assert(
            std::is_same<K, typename extract_value_types<OKTuple>::type>::value,
            "Incompatible input and output key types");

    precondition(
            fusion::at_c<0>(ikeys).nparts() == 1 && ivals.nparts() == 1,
            "reduce_by_key is only supported for single device contexts"
            );

    precondition(fusion::at_c<0>(ikeys).size() == ivals.size(),
            "keys and values should have same size"
            );

    const auto &queue = fusion::at_c<0>(ikeys).queue_list();
    backend::select_context(queue[0]);

    const int NT_cpu = 1;
    const int NT_gpu = 256;
    const int NT = is_cpu(queue[0]) ? NT_cpu : NT_gpu;

    size_t count         = fusion::at_c<0>(ikeys).size();
    size_t num_blocks    = (count + NT - 1) / NT;
    size_t scan_buf_size = alignup(num_blocks, NT);

    backend::device_vector<int> key_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   pre_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   post_sum  (queue[0], scan_buf_size);
    backend::device_vector<V>   offset_val(queue[0], count);
    backend::device_vector<int> offset    (queue[0], count);

    /***** Kernel 0 *****/
    auto krn0 = offset_calculation<K, Comp>(queue[0]);

    krn0.push_arg(count);
    boost::fusion::for_each(ikeys, do_push_arg(krn0));
    krn0.push_arg(offset);

    krn0(queue[0]);

    VEX_FUNCTION(int, plus, (int, x)(int, y), return x + y;);
Example #9
 matrix(
         const std::vector<backend::command_queue> &q,
         size_t nrows, size_t ncols,
         const PtrRange &ptr,
         const ColRange &col,
         const ValRange &val,
         bool fast_setup = true
    ) : q(q[0])
 {
     if (is_cpu(q[0])) {
         Acpu = std::make_shared<Csr>(q, nrows, ncols, ptr, col, val);
     } else {
         Agpu = std::make_shared<Ell>(q, nrows, ncols, ptr, col, val, fast_setup);
     }
 }
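Examples 5, 7 and 9 belong to a hybrid sparse-matrix wrapper that stores CSR on CPU devices and ELL on GPUs, forwarding every operation to whichever backend was built. A hypothetical construction call (ptr, col and val are the usual CRS arrays; q comes from the surrounding VexCL context):

// Hypothetical usage sketch; the constructor above picks the backend per device type.
std::vector<size_t> ptr = {0, 2, 3};        // row pointers
std::vector<size_t> col = {0, 1, 1};        // column indices
std::vector<double> val = {1.0, 2.0, 3.0};  // nonzero values
matrix A(q, 2, 2, ptr, col, val);           // Csr on a CPU device, Ell otherwise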
Example #10
        /// Select best launch configuration for the given shared memory requirements.
        void config(const cl::CommandQueue &queue, std::function<size_t(size_t)> smem) {
            cl::Device dev = queue.getInfo<CL_QUEUE_DEVICE>();

            if ( is_cpu(queue) ) {
                w_size = 1;
            } else {
                // Select workgroup size that would fit into the device.
                w_size = dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0] / 2;

                size_t max_ws   = max_threads_per_block(queue);
                size_t max_smem = max_shared_memory_per_block(queue);

                // Reduce workgroup size until it satisfies resource requirements:
                while( (w_size > max_ws) || (smem(w_size) > max_smem) )
                    w_size /= 2;
            }

            g_size = w_size * num_workgroups(queue);
        }
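The smem callback maps a candidate workgroup size to the bytes of shared memory the kernel would need at that size, so the loop can keep halving w_size until both the workgroup and the local-memory limits hold. A hypothetical call budgeting one double of local memory per work-item (kcfg stands for an instance of the enclosing class):

kcfg.config(queue, [](size_t wgs) { return wgs * sizeof(cl_double); });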
Example #11
 void kernel_arg_setter(const Vector &x,
     backend::kernel &kernel, unsigned part, size_t index_offset,
     detail::kernel_generator_state_ptr state) const
 {
     if (is_cpu(q)) {
         if (Acpu) {
             Acpu->kernel_arg_setter(x, kernel, part, index_offset, state);
         } else {
             Csr dummy_A(q);
             dummy_A.kernel_arg_setter(x, kernel, part, index_offset, state);
         }
     } else {
         if (Agpu) {
             Agpu->kernel_arg_setter(x, kernel, part, index_offset, state);
         } else {
             Ell dummy_A(q);
             dummy_A.kernel_arg_setter(x, kernel, part, index_offset, state);
         }
     }
 }
Example #12
        /// Select best launch configuration for the given shared memory requirements.
        void config(const boost::compute::command_queue &queue, std::function<size_t(size_t)> smem) {
            boost::compute::device dev = queue.get_device();

            size_t ws;

            if ( is_cpu(queue) ) {
                ws = 1;
            } else {
                // Select workgroup size that would fit into the device.
                ws = dev.get_info<std::vector<size_t>>(CL_DEVICE_MAX_WORK_ITEM_SIZES)[0] / 2;

                size_t max_ws   = max_threads_per_block(queue);
                size_t max_smem = max_shared_memory_per_block(queue);

                // Reduce workgroup size until it satisfies resource requirements:
                while( (ws > max_ws) || (smem(ws) > max_smem) )
                    ws /= 2;
            }

            config(num_workgroups(queue), ws);
        }
Example #13
    backend::device_vector<int> offset    (queue[0], count);

    /***** Kernel 0 *****/
    auto krn0 = offset_calculation<K, Comp>(queue[0]);

    krn0.push_arg(count);
    boost::fusion::for_each(ikeys, do_push_arg(krn0));
    krn0.push_arg(offset);

    krn0(queue[0]);

    VEX_FUNCTION(int, plus, (int, x)(int, y), return x + y;);
    scan(queue[0], offset, offset, 0, false, plus);

    /***** Kernel 1 *****/
    auto krn1 = is_cpu(queue[0]) ?
        block_scan_by_key<NT_cpu, V, Oper>(queue[0]) :
        block_scan_by_key<NT_gpu, V, Oper>(queue[0]);

    krn1.push_arg(count);
    krn1.push_arg(offset);
    krn1.push_arg(ivals(0));
    krn1.push_arg(offset_val);
    krn1.push_arg(key_sum);
    krn1.push_arg(pre_sum);

    krn1.config(num_blocks, NT);
    krn1(queue[0]);

    /***** Kernel 2 *****/
    uint work_per_thread = std::max<uint>(1U, static_cast<uint>(scan_buf_size / NT));
Example #14
inline kernel_call transpose_kernel(
        const backend::command_queue &queue, size_t width, size_t height,
        const backend::device_vector<T2> &in,
        const backend::device_vector<T2> &out
        )
{
    scoped_program_header header(queue, fft_kernel_header<T>());

    backend::source_generator o(queue);
    o << std::setprecision(25);

    // determine max block size to fit into local memory/workgroup
    size_t block_size = is_cpu(queue) ? 1 : 128;
    {
#if defined(VEXCL_BACKEND_OPENCL) || defined(VEXCL_BACKEND_COMPUTE)
        cl_device_id dev = backend::get_device_id(queue);
        cl_ulong local_size;
        size_t workgroup;
        clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &local_size, NULL);
        clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &workgroup, NULL);
#else
        const auto local_size = queue.device().max_shared_memory_per_block();
        const auto workgroup = queue.device().max_threads_per_block();
#endif
        while(block_size * block_size * sizeof(T) * 2 > local_size) block_size /= 2;
        while(block_size * block_size > workgroup) block_size /= 2;
    }

    // from NVIDIA SDK.
    o.begin_kernel("transpose");
    o.begin_kernel_parameters();
    o.template parameter< global_ptr<const T2> >("input");
    o.template parameter< global_ptr<      T2> >("output");
    o.template parameter< cl_uint              >("width");
    o.template parameter< cl_uint              >("height");
    o.end_kernel_parameters();

    o.new_line() << "const size_t global_x = " << o.global_id(0) << ";";
    o.new_line() << "const size_t global_y = " << o.global_id(1) << ";";
    o.new_line() << "const size_t local_x  = " << o.local_id(0)  << ";";
    o.new_line() << "const size_t local_y  = " << o.local_id(1)  << ";";
    o.new_line() << "const bool range = global_x < width && global_y < height;";

    // local memory
    {
        std::ostringstream s;
        s << "block[" << block_size * block_size << "]";
        o.smem_static_var(type_name<T2>(), s.str());
    }

    // copy from input to local memory
    o.new_line() << "if(range) "
        << "block[local_x + local_y * " << block_size << "] = input[global_x + global_y * width];";

    // wait until the whole block is filled
    o.new_line().barrier();

    // transpose local block to target
    o.new_line() << "if(range) "
      << "output[global_x * height + global_y] = block[local_x + local_y * " << block_size << "];";

    o.end_kernel();

    backend::kernel kernel(queue, o.str(), "transpose");

    kernel.push_arg(in);
    kernel.push_arg(out);
    kernel.push_arg(static_cast<cl_uint>(width));
    kernel.push_arg(static_cast<cl_uint>(height));

    // Round the range up to a multiple of the workgroup size; the last block
    // may not be completely filled.
    size_t r_w = (width  + block_size - 1) / block_size;
    size_t r_h = (height + block_size - 1) / block_size;

    kernel.config(backend::ndrange(r_w, r_h), backend::ndrange(block_size, block_size));

    std::ostringstream desc;
    desc << "transpose{"
         << "w=" << width << "(" << r_w << "), "
         << "h=" << height << "(" << r_h << "), "
         << "bs=" << block_size << "}";

    return kernel_call(false, desc.str(), kernel);
}
Example #15
        typename std::enable_if<
            boost::proto::matches<
                typename boost::proto::result_of::as_expr<Expr>::type,
                multivector_expr_grammar
            >::value,
            const multivector&
        >::type
        operator=(const Expr& expr) {
            static kernel_cache cache;

            const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

            // If any device in context is CPU, then do not fuse the kernel,
            // but assign components individually.
            if (std::any_of(queue.begin(), queue.end(), [](const cl::CommandQueue &q) { return is_cpu(qdev(q)); })) {
                assign_subexpressions<0, N>(boost::proto::as_child(expr));
                return *this;
            }

            for(uint d = 0; d < queue.size(); d++) {
                cl::Context context = qctx(queue[d]);
                cl::Device  device  = qdev(queue[d]);

                auto kernel = cache.find( context() );

                if (kernel == cache.end()) {
                    std::ostringstream kernel_name;
                    kernel_name << "multi_";
                    vector_name_context name_ctx(kernel_name);
                    boost::proto::eval(boost::proto::as_child(expr), name_ctx);

                    std::ostringstream source;
                    source << standard_kernel_header(device);

                    extract_user_functions()(
                            boost::proto::as_child(expr),
                            declare_user_function(source)
                            );

                    source << "kernel void " << kernel_name.str()
                           << "(\n\t" << type_name<size_t>() << " n";

                    for(size_t i = 0; i < N; )
                        source << ",\n\tglobal " << type_name<T>()
                               << " *res_" << ++i;

                    build_param_list<N>(boost::proto::as_child(expr), source);

                    source <<
                        "\n)\n{\n"
                        "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";

                    build_expr_list(boost::proto::as_child(expr), source);

                    source << "\t}\n}\n";

                    auto program = build_sources(context, source.str());

                    cl::Kernel krn(program, kernel_name.str().c_str());
                    size_t wgs = kernel_workgroup_size(krn, device);

                    kernel = cache.insert(std::make_pair(
                                context(), kernel_cache_entry(krn, wgs)
                                )).first;
                }

                if (size_t psize = vec[0]->part_size(d)) {
                    size_t w_size = kernel->second.wgsize;
                    size_t g_size = num_workgroups(device) * w_size;

                    uint pos = 0;
                    kernel->second.kernel.setArg(pos++, psize);

                    for(uint i = 0; i < N; i++)
                        kernel->second.kernel.setArg(pos++, vec[i]->operator()(d));

                    set_kernel_args<N>(
                            boost::proto::as_child(expr),
                            kernel->second.kernel, d, pos, vec[0]->part_start(d)
                            );

                    queue[d].enqueueNDRangeKernel(
                            kernel->second.kernel, cl::NullRange, g_size, w_size
                            );
                }
            }

            return *this;
        }
Example #16
        typename std::enable_if<
            !boost::proto::matches<
                typename boost::proto::result_of::as_expr<ExprTuple>::type,
                multivector_full_grammar
            >::value
#if !defined(_MSC_VER) || _MSC_VER >= 1700
            && N == std::tuple_size<ExprTuple>::value
#endif
            , const multivector&
        >::type
        operator=(const ExprTuple &expr) {
#endif /* matches a conditional opened before this excerpt */
            static kernel_cache cache;

            const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

            for(uint d = 0; d < queue.size(); d++) {
                cl::Context context = qctx(queue[d]);
                cl::Device  device  = qdev(queue[d]);

                auto kernel = cache.find( context() );

                if (kernel == cache.end()) {
                    std::ostringstream source;

                    source << standard_kernel_header(device);

                    {
                        get_header f(source);
                        for_each<0>(expr, f);
                    }

                    source <<
                        "kernel void multi_expr_tuple(\n"
                        "\t" << type_name<size_t>() << " n";

                    for(uint i = 1; i <= N; i++)
                        source << ",\n\tglobal " << type_name<T>() << " *res_" << i;

                    {
                        get_params f(source);
                        for_each<0>(expr, f);
                    }

                    source << "\n)\n{\n";

                    if ( is_cpu(device) ) {
                        source <<
                            "\tsize_t chunk_size  = (n + get_global_size(0) - 1) / get_global_size(0);\n"
                            "\tsize_t chunk_start = get_global_id(0) * chunk_size;\n"
                            "\tsize_t chunk_end   = min(n, chunk_start + chunk_size);\n"
                            "\tfor(size_t idx = chunk_start; idx < chunk_end; ++idx) {\n";
                    } else {
                        source <<
                            "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";
                    }

                    {
                        get_expressions f(source);
                        for_each<0>(expr, f);
                    }

                    source << "\n";

                    for(uint i = 1; i <= N; i++)
                        source << "\t\tres_" << i << "[idx] = buf_" << i << ";\n";

                    source << "\t}\n}\n";

                    auto program = build_sources(context, source.str());

                    cl::Kernel krn(program, "multi_expr_tuple");
                    size_t wgs = kernel_workgroup_size(krn, device);

                    kernel = cache.insert(std::make_pair(
                                context(), kernel_cache_entry(krn, wgs)
                                )).first;
                }

                if (size_t psize = vec[0]->part_size(d)) {
                    size_t w_size = kernel->second.wgsize;
                    size_t g_size = num_workgroups(device) * w_size;

                    uint pos = 0;
                    kernel->second.kernel.setArg(pos++, psize);

                    for(uint i = 0; i < N; i++)
                        kernel->second.kernel.setArg(pos++, (*vec[i])(d));

                    {
                        set_arguments f(kernel->second.kernel, d, pos, vec[0]->part_start(d));
                        for_each<0>(expr, f);
                    }

                    queue[d].enqueueNDRangeKernel(
                            kernel->second.kernel, cl::NullRange, g_size, w_size
                            );
                }
            }

            return *this;
        }
Example #17
void scan(
        backend::command_queue    const &queue,
        backend::device_vector<T> const &input,
        backend::device_vector<T>       &output,
        T init,
        bool exclusive,
        Oper
        )
{
    precondition(
            input.size() == output.size(),
            "Wrong output size in inclusive_scan"
            );

    backend::select_context(queue);

    const int NT_cpu = 1;
    const int NT_gpu = 256;
    const int NT = is_cpu(queue) ? NT_cpu : NT_gpu;
    const int NT2 = 2 * NT;

    int do_exclusive = exclusive ? 1 : 0;

    const size_t count         = input.size();
    const size_t num_blocks    = (count + NT2 - 1) / NT2;
    const size_t scan_buf_size = alignup(num_blocks, NT2);

    backend::device_vector<T> pre_sum1(queue, scan_buf_size);
    backend::device_vector<T> pre_sum2(queue, scan_buf_size);
    backend::device_vector<T> post_sum(queue, scan_buf_size);

    // Kernel0
    auto krn0 = is_cpu(queue) ?
        block_inclusive_scan<NT_cpu, T, Oper>(queue) :
        block_inclusive_scan<NT_gpu, T, Oper>(queue);

    krn0.push_arg(count);
    krn0.push_arg(input);
    krn0.push_arg(init);
    krn0.push_arg(pre_sum1);
    krn0.push_arg(pre_sum2);
    krn0.push_arg(do_exclusive);

    krn0.config(num_blocks, NT);

    krn0(queue);

    // Kernel1
    auto krn1 = is_cpu(queue) ?
        intra_block_inclusive_scan<NT_cpu, T, Oper>(queue) :
        intra_block_inclusive_scan<NT_gpu, T, Oper>(queue);

    uint work_per_thread = std::max<uint>(1U, static_cast<uint>(scan_buf_size / NT));
    krn1.push_arg(num_blocks);
    krn1.push_arg(post_sum);
    krn1.push_arg(pre_sum1);
    krn1.push_arg(init);
    krn1.push_arg(work_per_thread);

    krn1.config(1, NT);

    krn1(queue);

    // Kernel2
    auto krn2 = is_cpu(queue) ?
        block_addition<NT_cpu, T, Oper>(queue) :
        block_addition<NT_gpu, T, Oper>(queue);

    krn2.push_arg(count);
    krn2.push_arg(input);
    krn2.push_arg(output);
    krn2.push_arg(post_sum);
    krn2.push_arg(pre_sum2);
    krn2.push_arg(init);
    krn2.push_arg(do_exclusive);

    krn2.config(num_blocks * 2, NT);

    krn2(queue);
}
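A hypothetical invocation of the three-kernel scan above, computing an exclusive prefix sum over n integers (plus is assumed to be an addition functor compatible with the Oper parameter, as in the other examples):

backend::device_vector<int> in(queue, n), out(queue, n);
/* ... fill the input vector ... */
scan(queue, in, out, /*init=*/0, /*exclusive=*/true, plus);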
Example #18
int reduce_by_key_sink(
        IKTuple &&ikeys, vector<V> const &ivals,
        OKTuple &&okeys, vector<V>       &ovals,
        Comp, Oper
        )
{
    namespace fusion = boost::fusion;
    typedef typename extract_value_types<IKTuple>::type K;

    static_assert(
            std::is_same<K, typename extract_value_types<OKTuple>::type>::value,
            "Incompatible input and output key types");

    precondition(
            fusion::at_c<0>(ikeys).nparts() == 1 && ivals.nparts() == 1,
            "Sorting is only supported for single device contexts"
            );

    precondition(fusion::at_c<0>(ikeys).size() == ivals.size(),
            "keys and values should have same size"
            );

    const auto &queue = fusion::at_c<0>(ikeys).queue_list();
    backend::select_context(queue[0]);

    const int NT_cpu = 1;
    const int NT_gpu = 256;
    const int NT = is_cpu(queue[0]) ? NT_cpu : NT_gpu;

    size_t count         = fusion::at_c<0>(ikeys).size();
    size_t num_blocks    = (count + NT - 1) / NT;
    size_t scan_buf_size = alignup(num_blocks, NT);

    backend::device_vector<int> key_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   pre_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   post_sum  (queue[0], scan_buf_size);
    backend::device_vector<V>   offset_val(queue[0], count);
    backend::device_vector<int> offset    (queue[0], count);

    /***** Kernel 0 *****/
    auto krn0 = detail::offset_calculation<K, Comp>(queue[0]);

    krn0.push_arg(count);
    boost::fusion::for_each(ikeys, do_push_arg(krn0));
    krn0.push_arg(offset);

    krn0(queue[0]);

    VEX_FUNCTION(plus, int(int, int), "return prm1 + prm2;");
    detail::scan(queue[0], offset, offset, 0, false, plus);

    /***** Kernel 1 *****/
    auto krn1 = is_cpu(queue[0]) ?
        detail::block_scan_by_key<NT_cpu, V, Oper>(queue[0]) :
        detail::block_scan_by_key<NT_gpu, V, Oper>(queue[0]);

    krn1.push_arg(count);
    krn1.push_arg(offset);
    krn1.push_arg(ivals(0));
    krn1.push_arg(offset_val);
    krn1.push_arg(key_sum);
    krn1.push_arg(pre_sum);

    krn1.config(num_blocks, NT);
    krn1(queue[0]);

    /***** Kernel 2 *****/
    uint work_per_thread = std::max<uint>(1U, static_cast<uint>(scan_buf_size / NT));

    auto krn2 = is_cpu(queue[0]) ?
        detail::block_inclusive_scan_by_key<NT_cpu, V, Oper>(queue[0]) :
        detail::block_inclusive_scan_by_key<NT_gpu, V, Oper>(queue[0]);

    krn2.push_arg(num_blocks);
    krn2.push_arg(key_sum);
    krn2.push_arg(pre_sum);
    krn2.push_arg(post_sum);
    krn2.push_arg(work_per_thread);

    krn2.config(1, NT);
    krn2(queue[0]);

    /***** Kernel 3 *****/
    auto krn3 = detail::block_sum_by_key<V, Oper>(queue[0]);

    krn3.push_arg(count);
    krn3.push_arg(key_sum);
    krn3.push_arg(post_sum);
    krn3.push_arg(offset);
    krn3.push_arg(offset_val);

    krn3.config(num_blocks, NT);
    krn3(queue[0]);

    /***** resize okeys and ovals *****/
    int out_elements;
    offset.read(queue[0], count - 1, 1, &out_elements, true);
    ++out_elements;

    boost::fusion::for_each(okeys, do_vex_resize(queue, out_elements));
    ovals.resize(ivals.queue_list(), out_elements);

    /***** Kernel 4 *****/
    auto krn4 = detail::key_value_mapping<K, V>(queue[0]);

    krn4.push_arg(count);
    boost::fusion::for_each(ikeys, do_push_arg(krn4));
    boost::fusion::for_each(okeys, do_push_arg(krn4));
    krn4.push_arg(ovals(0));
    krn4.push_arg(offset);
    krn4.push_arg(offset_val);

    krn4(queue[0]);

    return out_elements;
}
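This sink is the single-device implementation behind VexCL's reduce_by_key front end; a hypothetical call through the public interface (vector contents elided, and equal keys assumed to be adjacent, as reduce_by_key expects):

vex::vector<int>    ikeys(ctx, n), okeys;
vex::vector<double> ivals(ctx, n), ovals;
/* ... fill ikeys and ivals ... */
int groups = vex::reduce_by_key(ikeys, ivals, okeys, ovals);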
Example #19
 source_generator(const command_queue &queue)
     : indent(0), first_prm(true), cpu( is_cpu(queue) )
 {
     src << standard_kernel_header(queue);
 }