/* This function determines the number of variable MTRRs and stores it in num_var_ranges */
static void __init set_num_var_ranges(void)
{
    unsigned long config = 0, dummy;

    if (use_intel())
        rdmsr(MSR_MTRRcap, config, dummy);
    else if (is_cpu(AMD))
        config = 2;
    else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
        config = 8;

    num_var_ranges = config & 0xff;
}
/* Put the processor into a state where MTRRs can be safely set */
void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
{
    unsigned int cr0;

    /* Disable interrupts locally */
    local_irq_save(ctxt->flags);

    if (use_intel() || is_cpu(CYRIX)) {
        /* Save value of CR4 and clear Page Global Enable (bit 7) */
        if (cpu_has_pge) {
            ctxt->cr4val = read_cr4();
            write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
        }

        /* Disable and flush caches. Note that wbinvd flushes the TLBs as a side effect */
        cr0 = read_cr0() | 0x40000000;
        wbinvd();
        write_cr0(cr0);
        wbinvd();

        if (use_intel())
            /* Save MTRR state */
            rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
        else
            /* Cyrix ARRs - everything else was excluded at the top */
            ctxt->ccr3 = getCx86(CX86_CCR3);
    }
}
/* Restore the processor after a set_mtrr_prepare */
void set_mtrr_done(struct set_mtrr_context *ctxt)
{
    if (use_intel() || is_cpu(CYRIX)) {
        /* Flush caches and TLBs */
        wbinvd();

        /* Restore MTRRdefType */
        if (use_intel())
            /* Intel (P6) standard MTRRs */
            mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
        else
            /* Cyrix ARRs - everything else was excluded at the top */
            setCx86(CX86_CCR3, ctxt->ccr3);

        /* Enable caches */
        write_cr0(read_cr0() & 0xbfffffff);

        /* Restore value of CR4 */
        if (cpu_has_pge)
            write_cr4(ctxt->cr4val);
    }
    /* Re-enable interrupts locally (if enabled previously) */
    local_irq_restore(ctxt->flags);
}
int mtrr_del_page(int reg, unsigned long base, unsigned long size)
{
    int i, max;
    mtrr_type ltype;
    unsigned long lbase;
    unsigned int lsize;
    int error = -EINVAL;

    if (!mtrr_if)
        return -ENXIO;

    max = num_var_ranges;
    /* No CPU hotplug when we change MTRR entries */
    lock_cpu_hotplug();
    mutex_lock(&mtrr_mutex);
    if (reg < 0) {
        /* Search for existing MTRR */
        for (i = 0; i < max; ++i) {
            mtrr_if->get(i, &lbase, &lsize, &ltype);
            if (lbase == base && lsize == size) {
                reg = i;
                break;
            }
        }
        if (reg < 0) {
            printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n",
                   base, size);
            goto out;
        }
    }
    if (reg >= max) {
        printk(KERN_WARNING "mtrr: register: %d too big\n", reg);
        goto out;
    }
    if (is_cpu(CYRIX) && !use_intel()) {
        if ((reg == 3) && arr3_protected) {
            printk(KERN_WARNING "mtrr: ARR3 cannot be changed\n");
            goto out;
        }
    }
    mtrr_if->get(reg, &lbase, &lsize, &ltype);
    if (lsize < 1) {
        printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg);
        goto out;
    }
    if (usage_table[reg] < 1) {
        printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
        goto out;
    }
    if (--usage_table[reg] < 1)
        set_mtrr(reg, 0, 0, 0);
    error = reg;
 out:
    mutex_unlock(&mtrr_mutex);
    unlock_cpu_hotplug();
    return error;
}
static void partial_vector_expr(const Vector &x, backend::source_generator &src,
        const backend::command_queue &q, const std::string &prm_name,
        detail::kernel_generator_state_ptr state)
{
    if (is_cpu(q)) {
        Csr::partial_vector_expr(x, src, q, prm_name, state);
    } else {
        Ell::partial_vector_expr(x, src, q, prm_name, state);
    }
}
void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
{
    if (use_intel())
        /* Disable MTRRs, and set the default type to uncached */
        mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL,
                   ctxt->deftype_hi);
    else if (is_cpu(CYRIX))
        /* Cyrix ARRs - everything else was excluded at the top */
        setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
}
static void local_terminal_init(const Vector &x, backend::source_generator &src,
        const backend::command_queue &q, const std::string &prm_name,
        detail::kernel_generator_state_ptr state)
{
    if (is_cpu(q)) {
        Csr::local_terminal_init(x, src, q, prm_name, state);
    } else {
        Ell::local_terminal_init(x, src, q, prm_name, state);
    }
}
int reduce_by_key_sink(
        IKTuple &&ikeys, vector<V> const &ivals,
        OKTuple &&okeys, vector<V> &ovals,
        Comp, Oper
        )
{
    namespace fusion = boost::fusion;

    typedef typename extract_value_types<IKTuple>::type K;

    static_assert(
            std::is_same<K, typename extract_value_types<OKTuple>::type>::value,
            "Incompatible input and output key types");

    precondition(
            fusion::at_c<0>(ikeys).nparts() == 1 && ivals.nparts() == 1,
            "reduce_by_key is only supported for single device contexts"
            );

    precondition(fusion::at_c<0>(ikeys).size() == ivals.size(),
            "keys and values should have same size"
            );

    const auto &queue = fusion::at_c<0>(ikeys).queue_list();
    backend::select_context(queue[0]);

    const int NT_cpu = 1;
    const int NT_gpu = 256;
    const int NT = is_cpu(queue[0]) ? NT_cpu : NT_gpu;

    size_t count         = fusion::at_c<0>(ikeys).size();
    size_t num_blocks    = (count + NT - 1) / NT;
    size_t scan_buf_size = alignup(num_blocks, NT);

    backend::device_vector<int> key_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   pre_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   post_sum  (queue[0], scan_buf_size);
    backend::device_vector<V>   offset_val(queue[0], count);
    backend::device_vector<int> offset    (queue[0], count);

    /***** Kernel 0 *****/
    auto krn0 = offset_calculation<K, Comp>(queue[0]);

    krn0.push_arg(count);
    boost::fusion::for_each(ikeys, do_push_arg(krn0));
    krn0.push_arg(offset);

    krn0(queue[0]);

    VEX_FUNCTION(int, plus, (int, x)(int, y), return x + y;);
matrix(
        const std::vector<backend::command_queue> &q,
        size_t nrows, size_t ncols,
        const PtrRange &ptr, const ColRange &col, const ValRange &val,
        bool fast_setup = true
        ) : q(q[0])
{
    if (is_cpu(q[0])) {
        Acpu = std::make_shared<Csr>(q, nrows, ncols, ptr, col, val);
    } else {
        Agpu = std::make_shared<Ell>(q, nrows, ncols, ptr, col, val, fast_setup);
    }
}
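A minimal usage sketch for the constructor above. It assumes the enclosing class is VexCL's vex::sparse::matrix and that the shown header path exists; both are assumptions, and only the constructor argument list comes from the snippet. The caller supplies CSR-style arrays once, and the class stores them as Csr on CPU devices or Ell on GPUs.

#include <vector>
#include <vexcl/vexcl.hpp>
#include <vexcl/sparse/matrix.hpp> // assumed header for the hybrid matrix

int main() {
    vex::Context ctx(vex::Filter::Env && vex::Filter::Count(1));

    // 3x3 tridiagonal matrix in CSR form (host data).
    std::vector<size_t> ptr = {0, 2, 5, 7};
    std::vector<size_t> col = {0, 1, 0, 1, 2, 1, 2};
    std::vector<double> val = {2, -1, -1, 2, -1, -1, 2};

    // The constructor above picks Csr or Ell storage depending on
    // whether the selected device is a CPU.
    vex::sparse::matrix<double> A(ctx, 3, 3, ptr, col, val);

    vex::vector<double> x(ctx, std::vector<double>{1, 2, 3});
    vex::vector<double> y(ctx, 3);

    y = A * x; // SpMV is generated through the partial_vector_expr/terminal hooks shown here

    return 0;
}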
/// Select best launch configuration for the given shared memory requirements.
void config(const cl::CommandQueue &queue, std::function<size_t(size_t)> smem) {
    cl::Device dev = queue.getInfo<CL_QUEUE_DEVICE>();

    if ( is_cpu(queue) ) {
        w_size = 1;
    } else {
        // Select workgroup size that would fit into the device.
        w_size = dev.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>()[0] / 2;

        size_t max_ws   = max_threads_per_block(queue);
        size_t max_smem = max_shared_memory_per_block(queue);

        // Reduce workgroup size until it satisfies resource requirements:
        while( (w_size > max_ws) || (smem(w_size) > max_smem) )
            w_size /= 2;
    }

    g_size = w_size * num_workgroups(queue);
}
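A sketch of how the smem callback is typically supplied. Only the config(queue, std::function<size_t(size_t)>) signature and the backend::kernel constructor come from this section; the kernel name and its source string are placeholders. The callback maps a candidate workgroup size to the bytes of local memory the kernel would need at that size, and config() halves the size until both the workgroup limit and the local-memory limit are satisfied.

// "source" is a placeholder kernel source string; the constructor form
// matches the one used in transpose_kernel below.
backend::kernel krn(queue, source, "reduce");

// One double of scratch space per work item, so the local-memory
// requirement grows linearly with the candidate workgroup size.
krn.config(queue, [](size_t wgs) { return wgs * sizeof(double); });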
void kernel_arg_setter(const Vector &x,
        backend::kernel &kernel, unsigned part, size_t index_offset,
        detail::kernel_generator_state_ptr state) const
{
    if (is_cpu(q)) {
        if (Acpu) {
            Acpu->kernel_arg_setter(x, kernel, part, index_offset, state);
        } else {
            Csr dummy_A(q);
            dummy_A.kernel_arg_setter(x, kernel, part, index_offset, state);
        }
    } else {
        if (Agpu) {
            Agpu->kernel_arg_setter(x, kernel, part, index_offset, state);
        } else {
            Ell dummy_A(q);
            dummy_A.kernel_arg_setter(x, kernel, part, index_offset, state);
        }
    }
}
/// Select best launch configuration for the given shared memory requirements.
void config(const boost::compute::command_queue &queue,
            std::function<size_t(size_t)> smem)
{
    boost::compute::device dev = queue.get_device();

    size_t ws;

    if ( is_cpu(queue) ) {
        ws = 1;
    } else {
        // Select workgroup size that would fit into the device.
        ws = dev.get_info<std::vector<size_t>>(CL_DEVICE_MAX_WORK_ITEM_SIZES)[0] / 2;

        size_t max_ws   = max_threads_per_block(queue);
        size_t max_smem = max_shared_memory_per_block(queue);

        // Reduce workgroup size until it satisfies resource requirements:
        while( (ws > max_ws) || (smem(ws) > max_smem) )
            ws /= 2;
    }

    config(num_workgroups(queue), ws);
}
backend::device_vector<int> offset    (queue[0], count);

/***** Kernel 0 *****/
auto krn0 = offset_calculation<K, Comp>(queue[0]);

krn0.push_arg(count);
boost::fusion::for_each(ikeys, do_push_arg(krn0));
krn0.push_arg(offset);

krn0(queue[0]);

VEX_FUNCTION(int, plus, (int, x)(int, y), return x + y;);
scan(queue[0], offset, offset, 0, false, plus);

/***** Kernel 1 *****/
auto krn1 = is_cpu(queue[0]) ?
    block_scan_by_key<NT_cpu, V, Oper>(queue[0]) :
    block_scan_by_key<NT_gpu, V, Oper>(queue[0]);

krn1.push_arg(count);
krn1.push_arg(offset);
krn1.push_arg(ivals(0));
krn1.push_arg(offset_val);
krn1.push_arg(key_sum);
krn1.push_arg(pre_sum);

krn1.config(num_blocks, NT);
krn1(queue[0]);

/***** Kernel 2 *****/
uint work_per_thread = std::max<uint>(1U, static_cast<uint>(scan_buf_size / NT));
inline kernel_call transpose_kernel(
        const backend::command_queue &queue, size_t width, size_t height,
        const backend::device_vector<T2> &in,
        const backend::device_vector<T2> &out
        )
{
    scoped_program_header header(queue, fft_kernel_header<T>());

    backend::source_generator o(queue);
    o << std::setprecision(25);

    // determine max block size to fit into local memory/workgroup
    size_t block_size = is_cpu(queue) ? 1 : 128;

    {
#if defined(VEXCL_BACKEND_OPENCL) || defined(VEXCL_BACKEND_COMPUTE)
        cl_device_id dev = backend::get_device_id(queue);
        cl_ulong local_size;
        size_t workgroup;

        clGetDeviceInfo(dev, CL_DEVICE_LOCAL_MEM_SIZE,
                sizeof(cl_ulong), &local_size, NULL);
        clGetDeviceInfo(dev, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                sizeof(size_t), &workgroup, NULL);
#else
        const auto local_size = queue.device().max_shared_memory_per_block();
        const auto workgroup  = queue.device().max_threads_per_block();
#endif

        while(block_size * block_size * sizeof(T) * 2 > local_size) block_size /= 2;
        while(block_size * block_size > workgroup) block_size /= 2;
    }

    // from NVIDIA SDK.
    o.begin_kernel("transpose");
    o.begin_kernel_parameters();
    o.template parameter< global_ptr<const T2> >("input");
    o.template parameter< global_ptr<      T2> >("output");
    o.template parameter< cl_uint              >("width");
    o.template parameter< cl_uint              >("height");
    o.end_kernel_parameters();

    o.new_line() << "const size_t global_x = " << o.global_id(0) << ";";
    o.new_line() << "const size_t global_y = " << o.global_id(1) << ";";
    o.new_line() << "const size_t local_x  = " << o.local_id(0)  << ";";
    o.new_line() << "const size_t local_y  = " << o.local_id(1)  << ";";
    o.new_line() << "const bool   range    = global_x < width && global_y < height;";

    // local memory
    {
        std::ostringstream s;
        s << "block[" << block_size * block_size << "]";
        o.smem_static_var(type_name<T2>(), s.str());
    }

    // copy from input to local memory
    o.new_line() << "if(range) "
        << "block[local_x + local_y * " << block_size << "] = "
        << "input[global_x + global_y * width];";

    // wait until the whole block is filled
    o.new_line().barrier();

    // transpose local block to target
    o.new_line() << "if(range) "
        << "output[global_x * height + global_y] = "
        << "block[local_x + local_y * " << block_size << "];";

    o.end_kernel();

    backend::kernel kernel(queue, o.str(), "transpose");
    kernel.push_arg(in);
    kernel.push_arg(out);
    kernel.push_arg(static_cast<cl_uint>(width));
    kernel.push_arg(static_cast<cl_uint>(height));

    // round the range up to a multiple of the workgroup size;
    // the last block may not be completely filled.
    size_t r_w = (width  + block_size - 1) / block_size;
    size_t r_h = (height + block_size - 1) / block_size;

    kernel.config(backend::ndrange(r_w, r_h),
            backend::ndrange(block_size, block_size));

    std::ostringstream desc;
    desc << "transpose{"
         << "w=" << width  << "(" << r_w << "), "
         << "h=" << height << "(" << r_h << "), "
         << "bs=" << block_size << "}";

    return kernel_call(false, desc.str(), kernel);
}
typename std::enable_if<
    boost::proto::matches<
        typename boost::proto::result_of::as_expr<Expr>::type,
        multivector_expr_grammar
    >::value,
    const multivector&
>::type
operator=(const Expr& expr) {
    static kernel_cache cache;

    const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

    // If any device in context is CPU, then do not fuse the kernel,
    // but assign components individually.
    if (std::any_of(queue.begin(), queue.end(), [](const cl::CommandQueue &q) {
                return is_cpu(qdev(q));
                }))
    {
        assign_subexpressions<0, N>(boost::proto::as_child(expr));
        return *this;
    }

    for(uint d = 0; d < queue.size(); d++) {
        cl::Context context = qctx(queue[d]);
        cl::Device  device  = qdev(queue[d]);

        auto kernel = cache.find( context() );

        if (kernel == cache.end()) {
            std::ostringstream kernel_name;
            kernel_name << "multi_";
            vector_name_context name_ctx(kernel_name);
            boost::proto::eval(boost::proto::as_child(expr), name_ctx);

            std::ostringstream source;
            source << standard_kernel_header(device);

            extract_user_functions()(
                    boost::proto::as_child(expr),
                    declare_user_function(source)
                    );

            source << "kernel void " << kernel_name.str()
                   << "(\n\t" << type_name<size_t>() << " n";

            for(size_t i = 0; i < N; )
                source << ",\n\tglobal " << type_name<T>() << " *res_" << ++i;

            build_param_list<N>(boost::proto::as_child(expr), source);

            source << "\n)\n{\n"
                      "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";

            build_expr_list(boost::proto::as_child(expr), source);

            source << "\t}\n}\n";

            auto program = build_sources(context, source.str());

            cl::Kernel krn(program, kernel_name.str().c_str());
            size_t wgs = kernel_workgroup_size(krn, device);

            kernel = cache.insert(std::make_pair(
                        context(), kernel_cache_entry(krn, wgs)
                        )).first;
        }

        if (size_t psize = vec[0]->part_size(d)) {
            size_t w_size = kernel->second.wgsize;
            size_t g_size = num_workgroups(device) * w_size;

            uint pos = 0;
            kernel->second.kernel.setArg(pos++, psize);

            for(uint i = 0; i < N; i++)
                kernel->second.kernel.setArg(pos++, vec[i]->operator()(d));

            set_kernel_args<N>(
                    boost::proto::as_child(expr),
                    kernel->second.kernel, d, pos, vec[0]->part_start(d)
                    );

            queue[d].enqueueNDRangeKernel(
                    kernel->second.kernel, cl::NullRange, g_size, w_size
                    );
        }
    }

    return *this;
}
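A usage sketch for the assignment operator above, assuming the enclosing class is VexCL's vex::multivector<T, N> (the context setup is illustrative). A right-hand side matching multivector_expr_grammar is compiled into one fused kernel per device, except when a CPU device is present in the context, in which case the components are assigned individually, as the early-return branch shows.

#include <vexcl/vexcl.hpp>

int main() {
    vex::Context ctx(vex::Filter::Env);

    const size_t n = 1024;
    vex::multivector<double, 2> X(ctx, n), Y(ctx, n);

    X = 1.0;           // broadcast a scalar to both components

    // Single expression applied to all components; fused into one
    // kernel unless a CPU device forces component-wise assignment.
    Y = 2.0 * X + 1.0;

    return 0;
}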
typename std::enable_if<
    !boost::proto::matches<
        typename boost::proto::result_of::as_expr<ExprTuple>::type,
        multivector_full_grammar
    >::value
#if !defined(_MSC_VER) || _MSC_VER >= 1700
    && N == std::tuple_size<ExprTuple>::value
#endif
    ,
    const multivector&
>::type
operator=(const ExprTuple &expr) {
#endif
    static kernel_cache cache;

    const std::vector<cl::CommandQueue> &queue = vec[0]->queue_list();

    for(uint d = 0; d < queue.size(); d++) {
        cl::Context context = qctx(queue[d]);
        cl::Device  device  = qdev(queue[d]);

        auto kernel = cache.find( context() );

        if (kernel == cache.end()) {
            std::ostringstream source;
            source << standard_kernel_header(device);

            {
                get_header f(source);
                for_each<0>(expr, f);
            }

            source << "kernel void multi_expr_tuple(\n"
                      "\t" << type_name<size_t>() << " n";

            for(uint i = 1; i <= N; i++)
                source << ",\n\tglobal " << type_name<T>() << " *res_" << i;

            {
                get_params f(source);
                for_each<0>(expr, f);
            }

            source << "\n)\n{\n";

            if ( is_cpu(device) ) {
                source <<
                    "\tsize_t chunk_size  = (n + get_global_size(0) - 1) / get_global_size(0);\n"
                    "\tsize_t chunk_start = get_global_id(0) * chunk_size;\n"
                    "\tsize_t chunk_end   = min(n, chunk_start + chunk_size);\n"
                    "\tfor(size_t idx = chunk_start; idx < chunk_end; ++idx) {\n";
            } else {
                source <<
                    "\tfor(size_t idx = get_global_id(0); idx < n; idx += get_global_size(0)) {\n";
            }

            {
                get_expressions f(source);
                for_each<0>(expr, f);
            }

            source << "\n";

            for(uint i = 1; i <= N; i++)
                source << "\t\tres_" << i << "[idx] = buf_" << i << ";\n";

            source << "\t}\n}\n";

            auto program = build_sources(context, source.str());

            cl::Kernel krn(program, "multi_expr_tuple");
            size_t wgs = kernel_workgroup_size(krn, device);

            kernel = cache.insert(std::make_pair(
                        context(), kernel_cache_entry(krn, wgs)
                        )).first;
        }

        if (size_t psize = vec[0]->part_size(d)) {
            size_t w_size = kernel->second.wgsize;
            size_t g_size = num_workgroups(device) * w_size;

            uint pos = 0;
            kernel->second.kernel.setArg(pos++, psize);

            for(uint i = 0; i < N; i++)
                kernel->second.kernel.setArg(pos++, (*vec[i])(d));

            {
                set_arguments f(kernel->second.kernel, d, pos, vec[0]->part_start(d));
                for_each<0>(expr, f);
            }

            queue[d].enqueueNDRangeKernel(
                    kernel->second.kernel, cl::NullRange, g_size, w_size
                    );
        }
    }

    return *this;
}
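The tuple overload above assigns a different expression to each component; the number of expressions must equal N, which is what the std::tuple_size check in the enable_if enforces. A hedged sketch, assuming VexCL's documented std::tie syntax for building the expression tuple:

#include <tuple>
#include <vexcl/vexcl.hpp>

int main() {
    vex::Context ctx(vex::Filter::Env);

    const size_t n = 1024;
    vex::multivector<double, 2> X(ctx, n), Y(ctx, n);
    Y = 1.0;

    // Component 0 gets the sum, component 1 the difference.
    X = std::tie(Y(0) + Y(1), Y(0) - Y(1));

    return 0;
}

Note that the generated kernel body differs by device: on CPUs each work item walks a contiguous chunk of the index range, while on GPUs a grid-stride loop is emitted.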
void scan(
        backend::command_queue const &queue,
        backend::device_vector<T> const &input,
        backend::device_vector<T>       &output,
        T init,
        bool exclusive,
        Oper
        )
{
    precondition(
            input.size() == output.size(),
            "Wrong output size in inclusive_scan"
            );

    backend::select_context(queue);

    const int NT_cpu = 1;
    const int NT_gpu = 256;
    const int NT  = is_cpu(queue) ? NT_cpu : NT_gpu;
    const int NT2 = 2 * NT;

    int do_exclusive = exclusive ? 1 : 0;

    const size_t count         = input.size();
    const size_t num_blocks    = (count + NT2 - 1) / NT2;
    const size_t scan_buf_size = alignup(num_blocks, NT2);

    backend::device_vector<T> pre_sum1(queue, scan_buf_size);
    backend::device_vector<T> pre_sum2(queue, scan_buf_size);
    backend::device_vector<T> post_sum(queue, scan_buf_size);

    // Kernel0
    auto krn0 = is_cpu(queue) ?
        block_inclusive_scan<NT_cpu, T, Oper>(queue) :
        block_inclusive_scan<NT_gpu, T, Oper>(queue);

    krn0.push_arg(count);
    krn0.push_arg(input);
    krn0.push_arg(init);
    krn0.push_arg(pre_sum1);
    krn0.push_arg(pre_sum2);
    krn0.push_arg(do_exclusive);

    krn0.config(num_blocks, NT);
    krn0(queue);

    // Kernel1
    auto krn1 = is_cpu(queue) ?
        intra_block_inclusive_scan<NT_cpu, T, Oper>(queue) :
        intra_block_inclusive_scan<NT_gpu, T, Oper>(queue);

    uint work_per_thread = std::max<uint>(1U, static_cast<uint>(scan_buf_size / NT));

    krn1.push_arg(num_blocks);
    krn1.push_arg(post_sum);
    krn1.push_arg(pre_sum1);
    krn1.push_arg(init);
    krn1.push_arg(work_per_thread);

    krn1.config(1, NT);
    krn1(queue);

    // Kernel2
    auto krn2 = is_cpu(queue) ?
        block_addition<NT_cpu, T, Oper>(queue) :
        block_addition<NT_gpu, T, Oper>(queue);

    krn2.push_arg(count);
    krn2.push_arg(input);
    krn2.push_arg(output);
    krn2.push_arg(post_sum);
    krn2.push_arg(pre_sum2);
    krn2.push_arg(init);
    krn2.push_arg(do_exclusive);

    krn2.config(num_blocks * 2, NT);
    krn2(queue);
}
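The function above is the low-level three-kernel implementation (per-block scan, scan of the block sums, block addition). A usage sketch through what I believe are the public VexCL entry points, vex::inclusive_scan and vex::exclusive_scan; the header path and exact overloads are assumptions, and the single-device context reflects that the implementation operates on one queue.

#include <vector>
#include <vexcl/vexcl.hpp>
#include <vexcl/scan.hpp> // assumed location of the public scan front end

int main() {
    vex::Context ctx(vex::Filter::Env && vex::Filter::Count(1));

    vex::vector<int> x(ctx, std::vector<int>{1, 2, 3, 4});
    vex::vector<int> y(ctx, 4);

    vex::inclusive_scan(x, y); // y = {1, 3, 6, 10}
    vex::exclusive_scan(x, y); // y = {0, 1, 3, 6}

    return 0;
}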
int reduce_by_key_sink(
        IKTuple &&ikeys, vector<V> const &ivals,
        OKTuple &&okeys, vector<V> &ovals,
        Comp, Oper
        )
{
    namespace fusion = boost::fusion;

    typedef typename extract_value_types<IKTuple>::type K;

    static_assert(
            std::is_same<K, typename extract_value_types<OKTuple>::type>::value,
            "Incompatible input and output key types");

    precondition(
            fusion::at_c<0>(ikeys).nparts() == 1 && ivals.nparts() == 1,
            "reduce_by_key is only supported for single device contexts"
            );

    precondition(fusion::at_c<0>(ikeys).size() == ivals.size(),
            "keys and values should have same size"
            );

    const auto &queue = fusion::at_c<0>(ikeys).queue_list();
    backend::select_context(queue[0]);

    const int NT_cpu = 1;
    const int NT_gpu = 256;
    const int NT = is_cpu(queue[0]) ? NT_cpu : NT_gpu;

    size_t count         = fusion::at_c<0>(ikeys).size();
    size_t num_blocks    = (count + NT - 1) / NT;
    size_t scan_buf_size = alignup(num_blocks, NT);

    backend::device_vector<int> key_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   pre_sum   (queue[0], scan_buf_size);
    backend::device_vector<V>   post_sum  (queue[0], scan_buf_size);
    backend::device_vector<V>   offset_val(queue[0], count);
    backend::device_vector<int> offset    (queue[0], count);

    /***** Kernel 0 *****/
    auto krn0 = detail::offset_calculation<K, Comp>(queue[0]);

    krn0.push_arg(count);
    boost::fusion::for_each(ikeys, do_push_arg(krn0));
    krn0.push_arg(offset);

    krn0(queue[0]);

    VEX_FUNCTION(plus, int(int, int), "return prm1 + prm2;");
    detail::scan(queue[0], offset, offset, 0, false, plus);

    /***** Kernel 1 *****/
    auto krn1 = is_cpu(queue[0]) ?
        detail::block_scan_by_key<NT_cpu, V, Oper>(queue[0]) :
        detail::block_scan_by_key<NT_gpu, V, Oper>(queue[0]);

    krn1.push_arg(count);
    krn1.push_arg(offset);
    krn1.push_arg(ivals(0));
    krn1.push_arg(offset_val);
    krn1.push_arg(key_sum);
    krn1.push_arg(pre_sum);

    krn1.config(num_blocks, NT);
    krn1(queue[0]);

    /***** Kernel 2 *****/
    uint work_per_thread = std::max<uint>(1U, static_cast<uint>(scan_buf_size / NT));

    auto krn2 = is_cpu(queue[0]) ?
        detail::block_inclusive_scan_by_key<NT_cpu, V, Oper>(queue[0]) :
        detail::block_inclusive_scan_by_key<NT_gpu, V, Oper>(queue[0]);

    krn2.push_arg(num_blocks);
    krn2.push_arg(key_sum);
    krn2.push_arg(pre_sum);
    krn2.push_arg(post_sum);
    krn2.push_arg(work_per_thread);

    krn2.config(1, NT);
    krn2(queue[0]);

    /***** Kernel 3 *****/
    auto krn3 = detail::block_sum_by_key<V, Oper>(queue[0]);

    krn3.push_arg(count);
    krn3.push_arg(key_sum);
    krn3.push_arg(post_sum);
    krn3.push_arg(offset);
    krn3.push_arg(offset_val);

    krn3.config(num_blocks, NT);
    krn3(queue[0]);

    /***** resize okeys and ovals *****/
    int out_elements;
    offset.read(queue[0], count - 1, 1, &out_elements, true);
    ++out_elements;

    boost::fusion::for_each(okeys, do_vex_resize(queue, out_elements));
    ovals.resize(ivals.queue_list(), out_elements);

    /***** Kernel 4 *****/
    auto krn4 = detail::key_value_mapping<K, V>(queue[0]);

    krn4.push_arg(count);
    boost::fusion::for_each(ikeys, do_push_arg(krn4));
    boost::fusion::for_each(okeys, do_push_arg(krn4));
    krn4.push_arg(ovals(0));
    krn4.push_arg(offset);
    krn4.push_arg(offset_val);

    krn4(queue[0]);

    return out_elements;
}
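For reference, a sketch of driving this through the public front end. I believe vex::reduce_by_key wraps the sink above, expects pre-sorted keys (e.g. from vex::sort_by_key), and returns the number of key runs; the header path, the exact overload, and the need to pass the comparison and reduction functors explicitly are assumptions. The functors use the VEX_FUNCTION form that also appears earlier in this section.

#include <vector>
#include <vexcl/vexcl.hpp>
#include <vexcl/reduce_by_key.hpp> // assumed header for the front end

int main() {
    vex::Context ctx(vex::Filter::Env && vex::Filter::Count(1));

    // Keys are already sorted; three runs: {1,1}, {2,2,2}, {5}.
    vex::vector<int>    keys(ctx, std::vector<int>   {1, 1, 2, 2, 2, 5});
    vex::vector<double> vals(ctx, std::vector<double>{1, 2, 3, 4, 5, 6});

    vex::vector<int>    okeys;
    vex::vector<double> ovals;

    VEX_FUNCTION(bool,   key_eq, (int, a)(int, b),       return a == b;);
    VEX_FUNCTION(double, plus,   (double, a)(double, b), return a + b;);

    // Expected result: okeys = {1, 2, 5}, ovals = {3, 12, 6}, n = 3.
    int n = vex::reduce_by_key(keys, vals, okeys, ovals, key_eq, plus);

    return n == 3 ? 0 : 1;
}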
source_generator(const command_queue &queue)
    : indent(0), first_prm(true), cpu( is_cpu(queue) )
{
    src << standard_kernel_header(queue);
}
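A condensed sketch of how this generator is driven; every call below also appears in the transpose_kernel snippet above, so only the kernel being generated (a trivial copy) is made up. The fragment assumes the same namespace context as the surrounding snippets.

// Generate and build a trivial element-wise copy kernel.
backend::source_generator src(queue);

src.begin_kernel("copy");
src.begin_kernel_parameters();
src.parameter< global_ptr<const double> >("in");
src.parameter< global_ptr<      double> >("out");
src.parameter< cl_uint                  >("n");
src.end_kernel_parameters();

src.new_line() << "const size_t i = " << src.global_id(0) << ";";
src.new_line() << "if (i < n) out[i] = in[i];";

src.end_kernel();

backend::kernel copy(queue, src.str(), "copy");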