template<typename T>
void init_random(viennacl::vector<T> & x)
{
  std::vector<T> cx(x.internal_size());
  for (std::size_t i = 0; i < cx.size(); ++i)
    cx[i] = T(rand()) / T(RAND_MAX);
  viennacl::fast_copy(&cx[0], &cx[0] + cx.size(), x.begin());
}
template<typename ScalarType>
ScalarType diff_2(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
{
  ublas::vector<ScalarType> v2_cpu(v2.size());
  viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());
  return norm_2(v1 - v2_cpu) / norm_2(v1);
}
template<typename NumericT>
NumericT diff(std::vector<NumericT> const & v1, viennacl::vector<NumericT> const & v2)
{
  std::vector<NumericT> v2_cpu(v2.size());
  viennacl::backend::finish();
  viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());

  for (std::size_t i = 0; i < v1.size(); ++i)
  {
    if (std::max(std::fabs(v2_cpu[i]), std::fabs(v1[i])) > 0)
      v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max(std::fabs(v2_cpu[i]), std::fabs(v1[i]));
    else
      v2_cpu[i] = 0.0;

    if (v2_cpu[i] > 0.0001)
    {
      //std::cout << "Neighbor: " << i-1 << ": " << v1[i-1] << " vs. " << v2_cpu[i-1] << std::endl;
      std::cout << "Error at entry " << i << ": " << v1[i] << " vs. " << v2[i] << std::endl;
      //std::cout << "Neighbor: " << i+1 << ": " << v1[i+1] << " vs. " << v2_cpu[i+1] << std::endl;
      exit(EXIT_FAILURE);
    }
  }

  NumericT inf_norm = 0;
  for (std::size_t i = 0; i < v2_cpu.size(); ++i)
    inf_norm = std::max<NumericT>(inf_norm, std::fabs(v2_cpu[i]));

  return inf_norm;
}
template<typename ScalarType>
ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
{
  ublas::vector<ScalarType> v2_cpu(v2.size());
  viennacl::backend::finish();
  viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());

  for (unsigned int i = 0; i < v1.size(); ++i)
  {
    if (std::max(std::fabs(v2_cpu[i]), std::fabs(v1[i])) > 0)
    {
      //if (std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) < 1e-10 )  //absolute tolerance (avoid round-off issues)
      //  v2_cpu[i] = 0;
      //else
      v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max(std::fabs(v2_cpu[i]), std::fabs(v1[i]));
    }
    else
      v2_cpu[i] = 0.0;

    if (v2_cpu[i] > 0.0001)
    {
      //std::cout << "Neighbor: " << i-1 << ": " << v1[i-1] << " vs. " << v2_cpu[i-1] << std::endl;
      // v2_cpu[i] already holds the relative error at this point, so print the device value v2[i]:
      std::cout << "Error at entry " << i << ": " << v1[i] << " vs. " << v2[i] << std::endl;
      //std::cout << "Neighbor: " << i+1 << ": " << v1[i+1] << " vs. " << v2_cpu[i+1] << std::endl;
      exit(EXIT_FAILURE);
    }
  }
  return norm_inf(v2_cpu);
}
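// --- added usage sketch (not from the original sources): how the relative-error
// checkers above are typically driven in a test. Assumes ScalarType is a
// file-level typedef (e.g. float); check_diff_example is a hypothetical name.
void check_diff_example()
{
  ublas::vector<ScalarType> ref(42);
  viennacl::vector<ScalarType> dev(42);
  for (std::size_t i = 0; i < ref.size(); ++i)
    ref[i] = ScalarType(i % 5) + ScalarType(0.5);
  viennacl::copy(ref.begin(), ref.end(), dev.begin());  // host -> device
  ScalarType rel_err = diff(ref, dev);                  // 0 for identical data; exits on mismatch > 1e-4
  std::cout << "max. relative error: " << rel_err << std::endl;
}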
void apply(viennacl::vector<ScalarType, ALIGNMENT> & vec) const
{
  assert(system_matrix.size1() == vec.size());

  //run kernel:
  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<ScalarType, ALIGNMENT>::program_name(), "diag_precond");
  viennacl::ocl::enqueue(k(diag_A_inv, vec, static_cast<cl_uint>(vec.size())));
}
template<typename SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
void prod_impl(const viennacl::hankel_matrix<SCALARTYPE, ALIGNMENT> & mat,
               const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec,
               viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & result)
{
  assert(mat.size1() == result.size());
  assert(mat.size2() == vec.size());

  prod_impl(mat.elements(), vec, result);  // Toeplitz product on the underlying elements...
  viennacl::detail::fft::reverse(result);  // ...then reverse to obtain the Hankel product
}
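// --- added usage sketch (assumption, not from the original sources): the
// Hankel prod_impl() above is what viennacl::linalg::prod() dispatches to,
// so user code simply writes:
void hankel_prod_example()
{
  std::size_t n = 64;
  viennacl::hankel_matrix<float> H(n, n);
  viennacl::vector<float> x(n), y(n);
  // ... fill H and x ...
  y = viennacl::linalg::prod(H, x);  // FFT-based Hankel matrix-vector product
}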
template<typename ScalarType, unsigned int Alignment>
ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType, Alignment> & v2)
{
  ublas::vector<ScalarType> v2_cpu(v2.size());
  viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());

  for (unsigned int i = 0; i < v1.size(); ++i)
  {
    if (std::max(std::fabs(v2_cpu[i]), std::fabs(v1[i])) > 0)
      v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max(std::fabs(v2_cpu[i]), std::fabs(v1[i]));
    else
      v2_cpu[i] = 0.0;
  }
  return norm_inf(v2_cpu);
}
template<typename TYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & mat,
               const viennacl::vector<TYPE, VECTOR_ALIGNMENT> & vec,
               viennacl::vector<TYPE, VECTOR_ALIGNMENT> & result,
               size_t /*NUM_THREADS*/ = 0)  // thread count is ignored; work sizes come from the kernel defaults
{
  assert(mat.size1() == result.size());
  assert(mat.size2() == vec.size());

  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::compressed_matrix<TYPE, ALIGNMENT>::program_name(), "vec_mul");
  viennacl::ocl::enqueue(k(mat.handle1(), mat.handle2(), mat.handle(), vec, result, static_cast<cl_uint>(mat.size1())));
}
template<typename NumericT>
bool bisect(const viennacl::vector<NumericT> & diagonal,
            const viennacl::vector<NumericT> & superdiagonal,
            viennacl::vector<NumericT> & eigenvalues)
{
  assert(diagonal.size() == superdiagonal.size() &&
         diagonal.size() == eigenvalues.size() &&
         bool("Input vectors do not have the same sizes!"));

  bool bResult = false;  // set to true once the eigenvalues have been computed successfully

  // desired precision of eigenvalues
  NumericT precision = static_cast<NumericT>(0.00001);
  const unsigned int mat_size = static_cast<unsigned int>(diagonal.size());

  // set up input
  viennacl::linalg::detail::InputData<NumericT> input(diagonal, superdiagonal, mat_size);

  NumericT lg = FLT_MAX;
  NumericT ug = -FLT_MAX;
  // compute Gerschgorin interval
  viennacl::linalg::detail::computeGerschgorin(input.std_a, input.std_b, mat_size, lg, ug);

  // decide whether the algorithm for small or for large matrices will be started
  if (mat_size <= VIENNACL_BISECT_MAX_SMALL_MATRIX)
  {
    // initialize memory for result
    viennacl::linalg::detail::ResultDataSmall<NumericT> result(mat_size);

    // run the kernel
    viennacl::linalg::detail::computeEigenvaluesSmallMatrix(input, result, mat_size, lg, ug, precision);

    // get the result from the device and do some sanity checks
    viennacl::linalg::detail::processResultSmallMatrix(result, mat_size);
    copy(result.std_eigenvalues, eigenvalues);
    bResult = true;
  }
  else
  {
    // initialize memory for result
    viennacl::linalg::detail::ResultDataLarge<NumericT> result(mat_size);

    // run the kernel
    viennacl::linalg::detail::computeEigenvaluesLargeMatrix(input, result, mat_size, lg, ug, precision);

    // get the result from the device and do some sanity checks
    bResult = viennacl::linalg::detail::processResultDataLargeMatrix(result, mat_size);
    copy(result.std_eigenvalues, eigenvalues);
  }
  return bResult;
}
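// --- added usage sketch (assumption, not from the original sources): calling
// bisect() for a small symmetric tridiagonal matrix with NumericT = float.
// The superdiagonal vector has the same length as the diagonal; its first
// entry is unused here and set to zero by convention.
void bisect_example()
{
  std::size_t n = 32;
  std::vector<float> diag(n), super(n);
  for (std::size_t i = 0; i < n; ++i)
  {
    diag[i]  = float(i) + 1.0f;
    super[i] = (i == 0) ? 0.0f : 0.1f;
  }
  viennacl::vector<float> vcl_diag(n), vcl_super(n), vcl_eig(n);
  viennacl::copy(diag, vcl_diag);
  viennacl::copy(super, vcl_super);
  if (bisect(vcl_diag, vcl_super, vcl_eig))
    std::cout << "eigenvalues computed successfully" << std::endl;
}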
static void test_scan_values(viennacl::vector<ScalarType> const & input,
                             viennacl::vector<ScalarType> & result,
                             bool is_inclusive_scan)
{
  std::vector<ScalarType> host_input(input.size());
  std::vector<ScalarType> host_result(result.size());

  viennacl::copy(input, host_input);
  viennacl::copy(result, host_result);

  // compute the reference scan in host_input in place:
  ScalarType sum = 0;
  if (is_inclusive_scan)
  {
    for (viennacl::vcl_size_t i = 0; i < input.size(); i++)
    {
      sum += host_input[i];
      host_input[i] = sum;
    }
  }
  else
  {
    for (viennacl::vcl_size_t i = 0; i < input.size(); i++)
    {
      ScalarType tmp = host_input[i];
      host_input[i] = sum;
      sum += tmp;
    }
  }

  for (viennacl::vcl_size_t i = 0; i < input.size(); i++)
  {
    if (host_input[i] != host_result[i])
    {
      std::cout << "Fail at vector index " << i << std::endl;
      std::cout << " result[" << i << "] = " << host_result[i] << std::endl;
      std::cout << " Reference = " << host_input[i] << std::endl;
      if (i > 0)
      {
        std::cout << " previous result[" << i-1 << "] = " << host_result[i-1] << std::endl;
        std::cout << " previous Reference = " << host_input[i-1] << std::endl;
      }
      exit(EXIT_FAILURE);
    }
  }
  std::cout << "PASSED!" << std::endl;
}
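// --- added usage sketch (assumption, not from the original sources): driving
// test_scan_values() with the library scans, as in the ViennaCL scan tests.
static void scan_example()
{
  std::size_t n = 1000;
  viennacl::vector<ScalarType> input  = viennacl::scalar_vector<ScalarType>(n, ScalarType(1));
  viennacl::vector<ScalarType> result(n);

  viennacl::linalg::inclusive_scan(input, result);   // result[i] = i + 1
  test_scan_values(input, result, true);

  viennacl::linalg::exclusive_scan(input, result);   // result[i] = i
  test_scan_values(input, result, false);
}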
void split_calc_distance(std::vector<double> & to_sort,
                         viennacl::ocl::context * p_context,
                         int num_splits,
                         naive_knn & knn,
                         viennacl::vector<double> & distances,
                         dense_sliding_window & sliding_window,
                         int num_instances,
                         viennacl::vector<double> & sample)
{
  int len = num_instances / num_splits;
  auto gpu_begin = distances.begin();
  auto gpu_end = gpu_begin + len;
  int last = num_instances - len * num_splits;  // remainder not covered by the equal-sized splits
  int current = 0;

  // launch the next chunk before sorting the previous one, so the host-side
  // sort overlaps with kernel execution:
  knn.calc_distance(distances, sliding_window, current, current + len, sample);
  current += len;
  for (; current < num_instances; current += len)
  {
    p_context->get_queue().finish();
    viennacl::copy(gpu_begin, gpu_end, to_sort.begin());
    knn.calc_distance(distances, sliding_window, current, current + len, sample);
    std::sort(to_sort.begin(), to_sort.end());
  }
  p_context->get_queue().finish();
  viennacl::copy(gpu_begin, gpu_end, to_sort.begin());
  std::sort(to_sort.begin(), to_sort.end());

  if (last > 0)
  {
    //knn.calc_distance(distances, sliding_window, current - len, current + last, sample);
  }
}
static void init_vector(viennacl::vector<ScalarType> & vcl_v)
{
  std::vector<ScalarType> v(vcl_v.size());
  for (std::size_t i = 0; i < v.size(); ++i)
    v[i] = ScalarType(i % 7 + 1);
  viennacl::copy(v, vcl_v);
}
ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
{
  ublas::vector<ScalarType> v2_cpu(v2.size());
  viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
  viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());

  for (std::size_t i = 0; i < v1.size(); ++i)
  {
    if (std::max(std::fabs(v2_cpu[i]), std::fabs(v1[i])) > 0)
      v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max(std::fabs(v2_cpu[i]), std::fabs(v1[i]));
    else
      v2_cpu[i] = 0.0;
  }
  return norm_inf(v2_cpu);
}
template<typename SCALARTYPE, unsigned int ALIGNMENT>
void prepare_householder_vector(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & A,
                                viennacl::vector<SCALARTYPE, ALIGNMENT> & D,
                                vcl_size_t size,
                                vcl_size_t row_start,
                                vcl_size_t col_start,
                                vcl_size_t start,
                                bool is_column)
{
  boost::numeric::ublas::vector<SCALARTYPE> tmp = boost::numeric::ublas::scalar_vector<SCALARTYPE>(size, 0);

  copy_vec(A, D, row_start, col_start, is_column);
  fast_copy(D.begin(), D.begin() + vcl_ptrdiff_t(size - start), tmp.begin() + start);
  //std::cout << "1: " << tmp << "\n";

  detail::householder_vector(tmp, start);
  fast_copy(tmp, D);
  //std::cout << "2: " << D << "\n";
}
template<typename SCALARTYPE, unsigned int ALIGNMENT, unsigned int VECTOR_ALIGNMENT>
void prod_impl(const viennacl::toeplitz_matrix<SCALARTYPE, ALIGNMENT> & mat,
               const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec,
               viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & result)
{
  assert(mat.size1() == result.size());
  assert(mat.size2() == vec.size());

  viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> tmp(vec.size() * 4);
  tmp.clear();
  viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> tmp2(vec.size() * 4);

  viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> tep(mat.elements().size() * 2);
  viennacl::detail::fft::real_to_complex(mat.elements(), tep, mat.elements().size());

  copy(vec, tmp);
  viennacl::detail::fft::real_to_complex(tmp, tmp2, vec.size() * 2);
  viennacl::linalg::convolve(tep, tmp2, tmp);
  viennacl::detail::fft::complex_to_real(tmp, tmp2, vec.size() * 2);
  copy(tmp2.begin(), tmp2.begin() + vec.size(), result.begin());
}
void apply(viennacl::vector<NumericT> & vec) const
{
  if (vec.handle().get_active_handle_id() != viennacl::MAIN_MEMORY)
  {
    if (tag_.use_level_scheduling())
    {
      //std::cout << "Using multifrontal on GPU..." << std::endl;
      detail::level_scheduling_substitute(vec, multifrontal_L_row_index_arrays_,
                                               multifrontal_L_row_buffers_,
                                               multifrontal_L_col_buffers_,
                                               multifrontal_L_element_buffers_,
                                               multifrontal_L_row_elimination_num_list_);

      vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);

      detail::level_scheduling_substitute(vec, multifrontal_U_row_index_arrays_,
                                               multifrontal_U_row_buffers_,
                                               multifrontal_U_col_buffers_,
                                               multifrontal_U_element_buffers_,
                                               multifrontal_U_row_elimination_num_list_);
    }
    else
    {
      viennacl::context host_context(viennacl::MAIN_MEMORY);
      viennacl::context old_context = viennacl::traits::context(vec);
      viennacl::switch_memory_context(vec, host_context);
      viennacl::linalg::inplace_solve(LU_, vec, unit_lower_tag());
      viennacl::linalg::inplace_solve(LU_, vec, upper_tag());
      viennacl::switch_memory_context(vec, old_context);
    }
  }
  else //apply ILUT directly:
  {
    viennacl::linalg::inplace_solve(LU_, vec, unit_lower_tag());
    viennacl::linalg::inplace_solve(LU_, vec, upper_tag());
  }
}
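// --- added usage sketch (assumption, not from the original sources): the
// apply() above is invoked by the iterative solvers through the preconditioner
// object, e.g. for ILUT with level scheduling enabled:
template<typename NumericT>
viennacl::vector<NumericT> ilut_solve_example(viennacl::compressed_matrix<NumericT> const & A,
                                              viennacl::vector<NumericT> const & b)
{
  viennacl::linalg::ilut_tag tag(20, 1e-4, true);  // entries per row, drop tolerance, use level scheduling
  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<NumericT> > precond(A, tag);
  return viennacl::linalg::solve(A, b, viennacl::linalg::bicgstab_tag(), precond);
}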
static void apply(::viennacl::vector<V> &x) { x.clear(); }
static bool same_size( const viennacl::vector< T > &x1 , const viennacl::vector< T > &x2 ) { return x1.size() == x2.size(); }
static void resize( viennacl::vector< T > &x1 , const viennacl::vector< T > &x2 ) { x1.resize( x2.size() , false ); }
template<typename NumericT>
void fill_vector(viennacl::vector<NumericT> & vec)
{
  for (std::size_t i = 0; i < vec.size(); ++i)
    vec(i) = NumericT(1.0) + NumericT(2.0) * random<NumericT>();   //some extra weight on diagonal for stability
}
void clear(viennacl::vector<ScalarType, AlignmentV> & vec) { vec.clear(); }
void clear(viennacl::vector<ScalarType, ALIGNMENT> & vec) { vec.clear(); }
template<typename MatrixType, typename ScalarType>
viennacl::vector<ScalarType> solve(MatrixType const & A,
                                   viennacl::vector<ScalarType> const & rhs,
                                   bicgstab_tag const & tag,
                                   viennacl::linalg::no_precond)
{
  viennacl::vector<ScalarType> result = viennacl::zero_vector<ScalarType>(rhs.size(), viennacl::traits::context(rhs));

  viennacl::vector<ScalarType> residual = rhs;
  viennacl::vector<ScalarType> p = rhs;
  viennacl::vector<ScalarType> r0star = rhs;
  viennacl::vector<ScalarType> Ap = rhs;
  viennacl::vector<ScalarType> s = rhs;
  viennacl::vector<ScalarType> As = rhs;

  // Layout of temporary buffer:
  //  chunk 0: <residual, r_0^*>
  //  chunk 1: <As, As>
  //  chunk 2: <As, s>
  //  chunk 3: <Ap, r_0^*>
  //  chunk 4: <As, r_0^*>
  //  chunk 5: <s, s>
  vcl_size_t buffer_size_per_vector = 256;
  vcl_size_t num_buffer_chunks = 6;
  viennacl::vector<ScalarType> inner_prod_buffer = viennacl::zero_vector<ScalarType>(num_buffer_chunks * buffer_size_per_vector, viennacl::traits::context(rhs)); // temporary buffer
  std::vector<ScalarType> host_inner_prod_buffer(inner_prod_buffer.size());

  ScalarType norm_rhs_host = viennacl::linalg::norm_2(residual);
  ScalarType beta;
  ScalarType alpha;
  ScalarType omega;
  ScalarType residual_norm = norm_rhs_host;
  inner_prod_buffer[0] = norm_rhs_host * norm_rhs_host;

  ScalarType r_dot_r0 = 0;
  ScalarType As_dot_As = 0;
  ScalarType As_dot_s = 0;
  ScalarType Ap_dot_r0 = 0;
  ScalarType As_dot_r0 = 0;
  ScalarType s_dot_s = 0;

  if (norm_rhs_host <= 0) //solution is zero if RHS norm is zero
    return result;

  for (vcl_size_t i = 0; i < tag.max_iterations(); ++i)
  {
    tag.iters(i+1);

    // Ap = A*p_j
    // Ap_dot_r0 = <Ap, r_0^*>
    viennacl::linalg::pipelined_bicgstab_prod(A, p, Ap, r0star, inner_prod_buffer, buffer_size_per_vector, 3*buffer_size_per_vector);

    //////// first (weak) synchronization point ////

    ///// method 1: compute alpha on host:
    //
    //// we only need the second chunk of the buffer for computing Ap_dot_r0:
    //viennacl::fast_copy(inner_prod_buffer.begin(), inner_prod_buffer.end(), host_inner_prod_buffer.begin());
    //Ap_dot_r0 = std::accumulate(host_inner_prod_buffer.begin() + buffer_size_per_vector, host_inner_prod_buffer.begin() + 2 * buffer_size_per_vector, ScalarType(0));
    //alpha = residual_dot_r0 / Ap_dot_r0;
    //// s_j = r_j - alpha_j q_j
    //s = residual - alpha * Ap;

    ///// method 2: compute alpha on device:
    // s = r - alpha * Ap
    // <s, s> first stage
    // dump alpha at end of inner_prod_buffer
    viennacl::linalg::pipelined_bicgstab_update_s(s, residual, Ap, inner_prod_buffer, buffer_size_per_vector, 5*buffer_size_per_vector);

    // As = A*s_j
    // As_dot_As = <As, As>
    // As_dot_s = <As, s>
    // As_dot_r0 = <As, r_0^*>
    viennacl::linalg::pipelined_bicgstab_prod(A, s, As, r0star, inner_prod_buffer, buffer_size_per_vector, 4*buffer_size_per_vector);

    //////// second (strong) synchronization point ////

    viennacl::fast_copy(inner_prod_buffer.begin(), inner_prod_buffer.end(), host_inner_prod_buffer.begin());

    r_dot_r0  = std::accumulate(host_inner_prod_buffer.begin(),                              host_inner_prod_buffer.begin() +     buffer_size_per_vector, ScalarType(0));
    As_dot_As = std::accumulate(host_inner_prod_buffer.begin() +     buffer_size_per_vector, host_inner_prod_buffer.begin() + 2 * buffer_size_per_vector, ScalarType(0));
    As_dot_s  = std::accumulate(host_inner_prod_buffer.begin() + 2 * buffer_size_per_vector, host_inner_prod_buffer.begin() + 3 * buffer_size_per_vector, ScalarType(0));
    Ap_dot_r0 = std::accumulate(host_inner_prod_buffer.begin() + 3 * buffer_size_per_vector, host_inner_prod_buffer.begin() + 4 * buffer_size_per_vector, ScalarType(0));
    As_dot_r0 = std::accumulate(host_inner_prod_buffer.begin() + 4 * buffer_size_per_vector, host_inner_prod_buffer.begin() + 5 * buffer_size_per_vector, ScalarType(0));
    s_dot_s   = std::accumulate(host_inner_prod_buffer.begin() + 5 * buffer_size_per_vector, host_inner_prod_buffer.begin() + 6 * buffer_size_per_vector, ScalarType(0));

    alpha =        r_dot_r0 / Ap_dot_r0;
    beta  = -1.0 * As_dot_r0 / Ap_dot_r0;
    omega =        As_dot_s / As_dot_As;

    residual_norm = std::sqrt(s_dot_s - 2.0 * omega * As_dot_s + omega * omega * As_dot_As);
    if (std::fabs(residual_norm / norm_rhs_host) < tag.tolerance())
      break;

    // x_{j+1} = x_j + alpha * p_j + omega * s_j
    // r_{j+1} = s_j - omega * t_j
    // p_{j+1} = r_{j+1} + beta * (p_j - omega * q_j)
    // and compute first stage of r_dot_r0 = <r_{j+1}, r_0^*> for use in next iteration
    viennacl::linalg::pipelined_bicgstab_vector_update(result, alpha, p, omega, s,
                                                       residual, As,
                                                       beta, Ap,
                                                       r0star,
                                                       inner_prod_buffer, buffer_size_per_vector);
  }

  //store last error estimate:
  tag.error(residual_norm / norm_rhs_host);

  return result;
}
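// --- added usage sketch (assumption, not from the original sources): invoking
// the pipelined BiCGStab solve() above without a preconditioner.
void bicgstab_example(viennacl::compressed_matrix<float> const & A,
                      viennacl::vector<float> const & rhs)
{
  bicgstab_tag tag(1e-8, 300);  // relative tolerance, max. iterations
  viennacl::vector<float> x = solve(A, rhs, tag, viennacl::linalg::no_precond());
  std::cout << "iterations: " << tag.iters() << ", est. rel. residual: " << tag.error() << std::endl;
}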
static size_t size(const viennacl::vector<ScalarType, A> & lhs, const RHS & rhs) { return lhs.size(); }
static size_t size2(viennacl::vector<ScalarType, A1> & lhs, viennacl::vector<ScalarType, A2> & rhs) { return rhs.size(); }  // fixed: viennacl::vector provides size(), not size2()
viennacl::vector<unsigned int> bucket_select(int N, const viennacl::vector<basic_type> & in)
{
  viennacl::vector<unsigned int> src(in.size(), viennacl::traits::context(in));
  viennacl::vector<unsigned int> dst(in.size(), viennacl::traits::context(in));

  // load kernels (once)
  static bool init = false;
  static int num_groups = 1;
  static int wg_size = 128;
  if (!init)
  {
    FILE * tmp = fopen("bucket_select.cl", "rb");
    fseek(tmp, 0, SEEK_END);
    std::vector<char> binary;
    binary.resize(ftell(tmp));
    rewind(tmp);
    if (fread(&binary[0], binary.size(), 1, tmp) != 1)  // check added: abort on short read
      exit(EXIT_FAILURE);
    fclose(tmp);
    binary.push_back(0);

    static viennacl::context g_context = viennacl::ocl::current_context();
    viennacl::ocl::context * ctx = g_context.opencl_pcontext();
    std::cout << "Device " << ctx->current_device().name() << std::endl;
    ctx->build_options("-cl-std=CL2.0 -D CL_VERSION_2_0");
    std::string program_text(&binary[0]);
    ctx->add_program(program_text, std::string("test"));
    init = true;  // note: the original redeclared 'static bool init' in this block, shadowing the outer flag and rebuilding the program on every call
  }

  viennacl::ocl::kernel & scan_kernel         = viennacl::ocl::current_context().get_kernel("test", "scan_buckets");
  viennacl::ocl::kernel & scatter_kernel      = viennacl::ocl::current_context().get_kernel("test", "scatter_buckets");
  viennacl::ocl::kernel & init_offsets_kernel = viennacl::ocl::current_context().get_kernel("test", "init_offsets");

  scan_kernel.local_work_size(0, wg_size);
  scan_kernel.global_work_size(0, wg_size * num_groups);
  scatter_kernel.local_work_size(0, wg_size);
  scatter_kernel.global_work_size(0, wg_size * num_groups);
  init_offsets_kernel.local_work_size(0, wg_size);
  init_offsets_kernel.global_work_size(0, wg_size * num_groups);

  cl_uint size = static_cast<cl_uint>(src.size());
  viennacl::ocl::enqueue(init_offsets_kernel(size, src));

  int position = 0;
  viennacl::vector<unsigned int> result(N, viennacl::traits::context(in));

  int num_buckets = 10;
  viennacl::vector<unsigned int> global_histogram((num_buckets + 1) * num_groups, viennacl::traits::context(in));
  viennacl::vector<unsigned int> global_histogram_prefix((num_buckets + 1) * num_groups + 1, viennacl::traits::context(in));
  std::vector<unsigned int> global_histogram_cpu((num_buckets + 1) * num_groups + 1);

  int scan_start = 0;
  int scan_end = in.size();

  basic_type pivot;
  basic_type base_value;
  int split_bucket = 0;
  base_value = 0;
  pivot = std::numeric_limits<basic_type>::max() / num_buckets;
  assert(pivot > 0);

  while (position < N)
  {
    int main = (scan_end / wg_size) * wg_size;                  // floor to multiple of wg size
    int loop_end = main == scan_end ? main : main + wg_size;    // add wg size if needed

    viennacl::ocl::enqueue(scan_kernel(in, src, scan_end, loop_end,
                                       viennacl::ocl::local_mem(sizeof(cl_uint) * wg_size),
                                       base_value, pivot, num_buckets, global_histogram));
    viennacl::linalg::exclusive_scan(global_histogram, global_histogram_prefix);
    viennacl::copy(global_histogram_prefix, global_histogram_cpu);
    global_histogram_cpu[global_histogram_cpu.size() - 1] = global_histogram_cpu[global_histogram_cpu.size() - 2];  // fix last element

    for (split_bucket = 1; split_bucket < num_buckets; ++split_bucket)
    {
      int offset = global_histogram_cpu[num_groups * split_bucket];
      if (offset >= N)
        break;
    }

    viennacl::ocl::enqueue(scatter_kernel(in, src, scan_end, loop_end,
                                          viennacl::ocl::local_mem(sizeof(cl_uint) * wg_size),
                                          (basic_type)base_value, (basic_type)pivot, num_buckets, split_bucket,
                                          global_histogram_prefix, dst));

    int hist_max = global_histogram_cpu[num_groups * split_bucket];
    int hist_min = global_histogram_cpu[num_groups * (split_bucket - 1)];

    //#ifdef DEBUG_RADIX_SELECT
    std::vector<unsigned int> dst_cpu(in.size());
    std::vector<unsigned int> src_cpu(in.size());
    viennacl::copy(dst, dst_cpu);
    viennacl::copy(src, src_cpu);
    //#endif

    if (hist_max == N)
      break;

    if (hist_max > N && hist_min < N)
    {
      scan_start = global_histogram_cpu[num_groups * (split_bucket - 1)];
      scan_end   = global_histogram_cpu[num_groups * split_bucket];
      if (scan_start > 0)
      {
        viennacl::copy(dst.begin(), dst.begin() + scan_start, result.begin() + position);
        position += scan_start;
      }

      //#ifdef DEBUG_RADIX_SELECT
      std::vector<unsigned int> result_cpu(in.size());
      viennacl::copy(result, result_cpu);
      //#endif

      if (position >= N)
        break;

      if (scan_end == dst.size() && scan_start == 0)
        dst.fast_swap(src);
      else
        viennacl::copy(dst.begin() + scan_start, dst.begin() + scan_end, src.begin());
      scan_end -= scan_start;
    }

    base_value += pivot * (split_bucket - 1);
    // update pivot
    pivot = pivot / num_buckets;
    if (pivot == 0)
      break;
  }

  if (position < N)
    viennacl::copy(dst.begin(), dst.begin() + (N - position), result.begin() + position);

  return result;
}
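// --- added usage sketch (assumption, not from the original sources): calling
// bucket_select(); as written above, the returned vector appears to hold the
// indices of the N smallest entries of 'in'. basic_type is assumed to be an
// unsigned integer type (e.g. cl_uint).
void bucket_select_example()
{
  std::size_t n = 1 << 20;
  std::vector<basic_type> host(n);
  for (std::size_t i = 0; i < n; ++i)
    host[i] = basic_type(rand());
  viennacl::vector<basic_type> in(n);
  viennacl::copy(host, in);

  viennacl::vector<unsigned int> selected = bucket_select(100, in);
}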