Example #1
// Fill a device vector with uniform random values in [0, 1].
// The host staging buffer is sized with internal_size(), since fast_copy()
// transfers the vector's full (possibly padded) buffer.
void init_random(viennacl::vector<T> & x)
{
    std::vector<T> cx(x.internal_size());
    for (std::size_t i = 0; i < cx.size(); ++i)
        cx[i] = T(rand())/T(RAND_MAX);
    viennacl::fast_copy(&cx[0], &cx[0] + cx.size(), x.begin());
}
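A minimal usage sketch for the helper above (the element type double and the vector size are illustrative assumptions, not part of the original snippet):

  viennacl::vector<double> x(1000);   // device vector
  init_random(x);                     // fill with uniform random values in [0, 1]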
Example #2
// Relative Euclidean (L2) difference between a host (ublas) vector and a device (ViennaCL) vector.
ScalarType diff_2(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
{
   ublas::vector<ScalarType> v2_cpu(v2.size());
   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());

   return norm_2(v1 - v2_cpu) / norm_2(v1);
}
Example #3
// Entry-wise relative difference between a host vector and a device vector:
// aborts if any entry deviates by more than 1e-4 in relative terms,
// otherwise returns the maximum relative error.
NumericT diff(std::vector<NumericT> const & v1, viennacl::vector<NumericT> const & v2)
{
  std::vector<NumericT> v2_cpu(v2.size());
  viennacl::backend::finish();
  viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());

  for (std::size_t i=0;i<v1.size(); ++i)
  {
    if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
      v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
    else
      v2_cpu[i] = 0.0;

    if (v2_cpu[i] > 0.0001)
    {
      //std::cout << "Neighbor: "      << i-1 << ": " << v1[i-1] << " vs. " << v2_cpu[i-1] << std::endl;
      std::cout << "Error at entry " << i   << ": " << v1[i]   << " vs. " << v2[i]   << std::endl;
      //std::cout << "Neighbor: "      << i+1 << ": " << v1[i+1] << " vs. " << v2_cpu[i+1] << std::endl;
      exit(EXIT_FAILURE);
    }
  }

  NumericT inf_norm = 0;
  for (std::size_t i=0;i<v2_cpu.size(); ++i)
    inf_norm = std::max<NumericT>(inf_norm, std::fabs(v2_cpu[i]));

  return inf_norm;
}
ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
{
   ublas::vector<ScalarType> v2_cpu(v2.size());
   viennacl::backend::finish();
   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());

   for (unsigned int i=0;i<v1.size(); ++i)
   {
      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
      {
        //if (std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) < 1e-10 )  //absolute tolerance (avoid round-off issues)
        //  v2_cpu[i] = 0;
        //else
          v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
      }
      else
         v2_cpu[i] = 0.0;

      if (v2_cpu[i] > 0.0001)
      {
        //std::cout << "Neighbor: "      << i-1 << ": " << v1[i-1] << " vs. " << v2_cpu[i-1] << std::endl;
        // v2_cpu[i] already holds the relative difference here, so print the device value v2[i]
        std::cout << "Error at entry " << i   << ": " << v1[i]   << " vs. " << v2[i]   << std::endl;
        //std::cout << "Neighbor: "      << i+1 << ": " << v1[i+1] << " vs. " << v2_cpu[i+1] << std::endl;
        exit(EXIT_FAILURE);
      }
   }

   return norm_inf(v2_cpu);
}
        void apply(viennacl::vector<ScalarType, ALIGNMENT> & vec) const
        {
          assert(system_matrix.size1() == vec.size());
          
          //run kernel:
          viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::vector<ScalarType, ALIGNMENT>::program_name(),
                                                                "diag_precond");

          viennacl::ocl::enqueue( k(diag_A_inv, vec, static_cast<cl_uint>(vec.size())) );        
        }
 void prod_impl(const viennacl::hankel_matrix<SCALARTYPE, ALIGNMENT> & mat, 
                const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec,
                      viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & result)
 {
   assert(mat.size1() == result.size());
   assert(mat.size2() == vec.size());
   
   prod_impl(mat.elements(), vec, result);
   viennacl::detail::fft::reverse(result);
 }
ScalarType diff ( ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType,Alignment> & v2 ) {
    ublas::vector<ScalarType> v2_cpu ( v2.size() );
    viennacl::copy( v2.begin(), v2.end(), v2_cpu.begin() );
    for ( unsigned int i=0; i<v1.size(); ++i ) {
        if ( std::max ( fabs ( v2_cpu[i] ), fabs ( v1[i] ) ) > 0 )
            v2_cpu[i] = fabs ( v2_cpu[i] - v1[i] ) / std::max ( fabs ( v2_cpu[i] ), fabs ( v1[i] ) );
        else
            v2_cpu[i] = 0.0;
    }
    return norm_inf ( v2_cpu );
}
    void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & mat, 
                   const viennacl::vector<TYPE, VECTOR_ALIGNMENT> & vec,
                         viennacl::vector<TYPE, VECTOR_ALIGNMENT> & result, 
                   size_t NUM_THREADS = 0)
    {
      assert(mat.size1() == result.size());
      assert(mat.size2() == vec.size());

      viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::compressed_matrix<TYPE, ALIGNMENT>::program_name(), "vec_mul");
      
      viennacl::ocl::enqueue(k(mat.handle1(), mat.handle2(), mat.handle(), vec, result, static_cast<cl_uint>(mat.size1())));
    }
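User code normally reaches a kernel like the one above through the high-level product interface rather than calling prod_impl directly; a hedged sketch (the element type float is an assumption and the matrix setup is omitted):

  viennacl::compressed_matrix<float> A;   // assumed populated, e.g. via viennacl::copy() from host data
  viennacl::vector<float> x(A.size2());
  viennacl::vector<float> y(A.size1());
  y = viennacl::linalg::prod(A, x);       // sparse matrix-vector product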
Example #9
bool
bisect(const viennacl::vector<NumericT> & diagonal, const viennacl::vector<NumericT> & superdiagonal, viennacl::vector<NumericT> & eigenvalues)
{
  assert(diagonal.size() == superdiagonal.size() &&
         diagonal.size() == eigenvalues.size()   &&
         bool("Input vectors do not have the same sizes!"));
  // return flag: true if the eigenvalue computation succeeded
  bool bResult = false;
  // desired precision of eigenvalues
  NumericT  precision = static_cast<NumericT>(0.00001);
  const unsigned int mat_size = static_cast<unsigned int>(diagonal.size());

  // set up input
  viennacl::linalg::detail::InputData<NumericT> input(diagonal, superdiagonal, mat_size);

  NumericT lg =  FLT_MAX;
  NumericT ug = -FLT_MAX;
  // compute Gerschgorin interval
  viennacl::linalg::detail::computeGerschgorin(input.std_a, input.std_b, mat_size, lg, ug);

  // decide whether the algorithm for small or for large matrices will be started
  if (mat_size <= VIENNACL_BISECT_MAX_SMALL_MATRIX)
  {
    // initialize memory for result
    viennacl::linalg::detail::ResultDataSmall<NumericT> result(mat_size);

    // run the kernel
    viennacl::linalg::detail::computeEigenvaluesSmallMatrix(input, result, mat_size, lg, ug, precision);

    // get the result from the device and do some sanity checks
    viennacl::linalg::detail::processResultSmallMatrix(result, mat_size);
    copy(result.std_eigenvalues, eigenvalues);
    bResult = true;
  }

  else
  {
    // initialize memory for result
    viennacl::linalg::detail::ResultDataLarge<NumericT> result(mat_size);

    // run the kernel
    viennacl::linalg::detail::computeEigenvaluesLargeMatrix(input, result, mat_size, lg, ug, precision);

    // get the result from the device and do some sanity checks
    bResult = viennacl::linalg::detail::processResultDataLargeMatrix(result, mat_size);

    copy(result.std_eigenvalues, eigenvalues);
  }
  return bResult;
}
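A hedged driver sketch for the routine above (size and entries are illustrative; NumericT is taken as float, matching the FLT_MAX bounds used above):

  std::size_t n = 230;
  viennacl::vector<float> diagonal      = viennacl::scalar_vector<float>(n,  2.0f);
  viennacl::vector<float> superdiagonal = viennacl::scalar_vector<float>(n, -1.0f);
  viennacl::vector<float> eigenvalues(n);
  superdiagonal[0] = 0.0f;   // leading off-diagonal entry is unused and typically set to zero
  bool ok = bisect(diagonal, superdiagonal, eigenvalues);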
Example #10
static void test_scan_values(viennacl::vector<ScalarType> const & input,
                             viennacl::vector<ScalarType> & result,
                             bool is_inclusive_scan)
{
  std::vector<ScalarType> host_input(input.size());
  std::vector<ScalarType> host_result(result.size());

  viennacl::copy(input, host_input);
  viennacl::copy(result, host_result);

  ScalarType sum = 0;
  if (is_inclusive_scan)
  {
    for(viennacl::vcl_size_t i = 0; i < input.size(); i++)
    {
      sum += host_input[i];
      host_input[i] = sum;
    }
  }
  else
  {
    for(viennacl::vcl_size_t i = 0; i < input.size(); i++)
    {
      ScalarType tmp = host_input[i];
      host_input[i] = sum;
      sum += tmp;
    }
  }


  for(viennacl::vcl_size_t i = 0; i < input.size(); i++)
  {
    if (host_input[i] != host_result[i])
    {
      std::cout << "Fail at vector index " << i << std::endl;
      std::cout << " result[" << i << "] = " << host_result[i] << std::endl;
      std::cout << " Reference = " << host_input[i] << std::endl;
      if (i > 0)
      {
        std::cout << " previous result[" << i-1 << "] = " << host_result[i-1] << std::endl;
        std::cout << " previous Reference = " << host_input[i-1] << std::endl;
      }
      exit(EXIT_FAILURE);
    }
  }
  std::cout << "PASSED!" << std::endl;

}
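A hedged sketch of how this checker is typically driven (the size and fill pattern are illustrative; viennacl::linalg::inclusive_scan / exclusive_scan are assumed to be the scan routines under test):

  std::vector<ScalarType> host_init(1024);
  for (std::size_t i = 0; i < host_init.size(); ++i)
    host_init[i] = ScalarType(i % 7 + 1);
  viennacl::vector<ScalarType> input(host_init.size());
  viennacl::vector<ScalarType> result(host_init.size());
  viennacl::copy(host_init, input);

  viennacl::linalg::inclusive_scan(input, result);
  test_scan_values(input, result, true);
  viennacl::linalg::exclusive_scan(input, result);
  test_scan_values(input, result, false);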
Example #11
void split_calc_distance(std::vector<double>& to_sort,viennacl::ocl::context* p_context, int num_splits, naive_knn& knn, viennacl::vector<double>& distances, dense_sliding_window& sliding_window, int num_instances, viennacl::vector<double>& sample)
{
	int len = num_instances / num_splits;
	auto gpu_begin = distances.begin();
	auto gpu_end = gpu_begin + len;

	int last = num_instances - len * num_splits;
	int current = 0;
	knn.calc_distance(distances, sliding_window, current, current+len, sample);
	current += len;
	for (; current < num_instances; current += len)
	{
		p_context->get_queue().finish();
		viennacl::copy(gpu_begin, gpu_end, to_sort.begin());
		knn.calc_distance(distances, sliding_window, current, current+len, sample);
		std::sort(to_sort.begin(), to_sort.end());
	}
	p_context->get_queue().finish();
	viennacl::copy(gpu_begin, gpu_end, to_sort.begin());
	std::sort(to_sort.begin(), to_sort.end());
	if (last > 0)
	{
		//knn.calc_distance(distances, sliding_window, current -len, current + last, sample);

	}

}
Example #12
static void init_vector(viennacl::vector<ScalarType>& vcl_v)
{
  std::vector<ScalarType> v(vcl_v.size());
  for (std::size_t i = 0; i < v.size(); ++i)
    v[i] = ScalarType(i % 7 + 1);
  viennacl::copy(v, vcl_v);
}
ScalarType diff(ublas::vector<ScalarType> & v1, viennacl::vector<ScalarType> & v2)
{
   ublas::vector<ScalarType> v2_cpu(v2.size());
   viennacl::backend::finish();  //workaround for a bug in APP SDK 2.7 on Trinity APUs (with Catalyst 12.8)
   viennacl::copy(v2.begin(), v2.end(), v2_cpu.begin());

   for (std::size_t i=0;i<v1.size(); ++i)
   {
      if ( std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) ) > 0 )
         v2_cpu[i] = std::fabs(v2_cpu[i] - v1[i]) / std::max( std::fabs(v2_cpu[i]), std::fabs(v1[i]) );
      else
         v2_cpu[i] = 0.0;
   }

   return norm_inf(v2_cpu);
}
      void prepare_householder_vector(
                                    viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
                                    viennacl::vector<SCALARTYPE, ALIGNMENT>& D,
                                    vcl_size_t size,
                                    vcl_size_t row_start,
                                    vcl_size_t col_start,
                                    vcl_size_t start,
                                    bool is_column
                                    )
      {
        boost::numeric::ublas::vector<SCALARTYPE> tmp = boost::numeric::ublas::scalar_vector<SCALARTYPE>(size, 0);

        copy_vec(A, D, row_start, col_start, is_column);
        fast_copy(D.begin(), D.begin() + vcl_ptrdiff_t(size - start), tmp.begin() + start);

        //std::cout << "1: " << tmp << "\n";

        detail::householder_vector(tmp, start);
        fast_copy(tmp, D);

        //std::cout << "2: "  << D << "\n";
      }
Example #15
void prod_impl(const viennacl::toeplitz_matrix<SCALARTYPE, ALIGNMENT> & mat,
               const viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & vec,
               viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> & result)
{
    assert(mat.size1() == result.size());
    assert(mat.size2() == vec.size());

    viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> tmp(vec.size() * 4);
    tmp.clear();
    viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> tmp2(vec.size() * 4);

    viennacl::vector<SCALARTYPE, VECTOR_ALIGNMENT> tep(mat.elements().size() * 2);
    viennacl::detail::fft::real_to_complex(mat.elements(), tep, mat.elements().size());



    copy(vec, tmp);
    viennacl::detail::fft::real_to_complex(tmp, tmp2, vec.size() * 2);
    viennacl::linalg::convolve(tep, tmp2, tmp);
    viennacl::detail::fft::complex_to_real(tmp, tmp2, vec.size() * 2);
    copy(tmp2.begin(), tmp2.begin() + vec.size(), result.begin());
}
Example #16
  void apply(viennacl::vector<NumericT> & vec) const
  {
    if (vec.handle().get_active_handle_id() != viennacl::MAIN_MEMORY)
    {
      if (tag_.use_level_scheduling())
      {
        //std::cout << "Using multifrontal on GPU..." << std::endl;
        detail::level_scheduling_substitute(vec,
                                            multifrontal_L_row_index_arrays_,
                                            multifrontal_L_row_buffers_,
                                            multifrontal_L_col_buffers_,
                                            multifrontal_L_element_buffers_,
                                            multifrontal_L_row_elimination_num_list_);

        vec = viennacl::linalg::element_div(vec, multifrontal_U_diagonal_);

        detail::level_scheduling_substitute(vec,
                                            multifrontal_U_row_index_arrays_,
                                            multifrontal_U_row_buffers_,
                                            multifrontal_U_col_buffers_,
                                            multifrontal_U_element_buffers_,
                                            multifrontal_U_row_elimination_num_list_);
      }
      else
      {
        viennacl::context host_context(viennacl::MAIN_MEMORY);
        viennacl::context old_context = viennacl::traits::context(vec);
        viennacl::switch_memory_context(vec, host_context);
        viennacl::linalg::inplace_solve(LU_, vec, unit_lower_tag());
        viennacl::linalg::inplace_solve(LU_, vec, upper_tag());
        viennacl::switch_memory_context(vec, old_context);
      }
    }
    else //apply ILUT directly:
    {
      viennacl::linalg::inplace_solve(LU_, vec, unit_lower_tag());
      viennacl::linalg::inplace_solve(LU_, vec, upper_tag());
    }
  }
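In user code, a preconditioner of this kind is applied implicitly by handing it to an iterative solver; a hedged sketch (A, rhs, and NumericT are assumed to be set up elsewhere; the ilut_tag parameters and the GMRES choice are illustrative):

  viennacl::linalg::ilut_tag ilut_config(20, 1e-4);   // at most 20 entries per row, drop tolerance 1e-4
  viennacl::linalg::ilut_precond< viennacl::compressed_matrix<NumericT> > precond(A, ilut_config);
  viennacl::vector<NumericT> x = viennacl::linalg::solve(A, rhs, viennacl::linalg::gmres_tag(), precond);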
Example #17
 static void apply(::viennacl::vector<V> &x)
 {
     x.clear();
 }
 static bool same_size( const viennacl::vector< T > &x1 , const viennacl::vector< T > &x2 )
 {
     return x1.size() == x2.size();
 }
 static void resize( viennacl::vector< T > &x1 , const viennacl::vector< T > &x2 )
 {
     x1.resize( x2.size() , false );
 }
void fill_vector(viennacl::vector<NumericT> & vec)
{
  for (std::size_t i = 0; i < vec.size(); ++i)
    vec(i) = NumericT(1.0) + NumericT(2.0) * random<NumericT>(); //some extra weight on diagonal for stability
}
Example #21
void clear(viennacl::vector<ScalarType, AlignmentV> & vec)
{
  vec.clear();
}
Example #22
void clear(viennacl::vector<ScalarType, ALIGNMENT> & vec)
{
    vec.clear();
}
Example #23
    viennacl::vector<ScalarType> solve(MatrixType const & A, //MatrixType const & A,
                                       viennacl::vector<ScalarType> const & rhs,
                                       bicgstab_tag const & tag,
                                       viennacl::linalg::no_precond)
    {
      viennacl::vector<ScalarType> result = viennacl::zero_vector<ScalarType>(rhs.size(), viennacl::traits::context(rhs));

      viennacl::vector<ScalarType> residual = rhs;
      viennacl::vector<ScalarType> p = rhs;
      viennacl::vector<ScalarType> r0star = rhs;
      viennacl::vector<ScalarType> Ap = rhs;
      viennacl::vector<ScalarType> s  = rhs;
      viennacl::vector<ScalarType> As = rhs;

      // Layout of temporary buffer:
      //  chunk 0: <residual, r_0^*>
      //  chunk 1: <As, As>
      //  chunk 2: <As, s>
      //  chunk 3: <Ap, r_0^*>
      //  chunk 4: <As, r_0^*>
      //  chunk 5: <s, s>
      vcl_size_t buffer_size_per_vector = 256;
      vcl_size_t num_buffer_chunks = 6;
      viennacl::vector<ScalarType> inner_prod_buffer = viennacl::zero_vector<ScalarType>(num_buffer_chunks*buffer_size_per_vector, viennacl::traits::context(rhs)); // temporary buffer
      std::vector<ScalarType>      host_inner_prod_buffer(inner_prod_buffer.size());

      ScalarType norm_rhs_host = viennacl::linalg::norm_2(residual);
      ScalarType beta;
      ScalarType alpha;
      ScalarType omega;
      ScalarType residual_norm = norm_rhs_host;
      inner_prod_buffer[0] = norm_rhs_host * norm_rhs_host;

      ScalarType  r_dot_r0 = 0;
      ScalarType As_dot_As = 0;
      ScalarType As_dot_s  = 0;
      ScalarType Ap_dot_r0 = 0;
      ScalarType As_dot_r0 = 0;
      ScalarType  s_dot_s  = 0;

      if (norm_rhs_host <= 0) //solution is zero if RHS norm is zero
        return result;

      for (vcl_size_t i = 0; i < tag.max_iterations(); ++i)
      {
        tag.iters(i+1);
        // Ap = A*p_j
        // Ap_dot_r0 = <Ap, r_0^*>
        viennacl::linalg::pipelined_bicgstab_prod(A, p, Ap, r0star,
                                                  inner_prod_buffer, buffer_size_per_vector, 3*buffer_size_per_vector);

        //////// first (weak) synchronization point ////

        ///// method 1: compute alpha on host:
        //
        //// we only need the second chunk of the buffer for computing Ap_dot_r0:
        //viennacl::fast_copy(inner_prod_buffer.begin(), inner_prod_buffer.end(), host_inner_prod_buffer.begin());
        //Ap_dot_r0 = std::accumulate(host_inner_prod_buffer.begin() +     buffer_size_per_vector, host_inner_prod_buffer.begin() + 2 * buffer_size_per_vector, ScalarType(0));

        //alpha = residual_dot_r0 / Ap_dot_r0;

        //// s_j = r_j - alpha_j q_j
        //s = residual - alpha * Ap;

        ///// method 2: compute alpha on device:
        // s = r - alpha * Ap
        // <s, s> first stage
        // dump alpha at end of inner_prod_buffer
        viennacl::linalg::pipelined_bicgstab_update_s(s, residual, Ap,
                                                      inner_prod_buffer, buffer_size_per_vector, 5*buffer_size_per_vector);

        // As = A*s_j
        // As_dot_As = <As, As>
        // As_dot_s  = <As, s>
        // As_dot_r0 = <As, r_0^*>
        viennacl::linalg::pipelined_bicgstab_prod(A, s, As, r0star,
                                                  inner_prod_buffer, buffer_size_per_vector, 4*buffer_size_per_vector);

        //////// second (strong) synchronization point ////

        viennacl::fast_copy(inner_prod_buffer.begin(), inner_prod_buffer.end(), host_inner_prod_buffer.begin());

         r_dot_r0 = std::accumulate(host_inner_prod_buffer.begin(),                              host_inner_prod_buffer.begin() +     buffer_size_per_vector, ScalarType(0));
        As_dot_As = std::accumulate(host_inner_prod_buffer.begin() +     buffer_size_per_vector, host_inner_prod_buffer.begin() + 2 * buffer_size_per_vector, ScalarType(0));
        As_dot_s  = std::accumulate(host_inner_prod_buffer.begin() + 2 * buffer_size_per_vector, host_inner_prod_buffer.begin() + 3 * buffer_size_per_vector, ScalarType(0));
        Ap_dot_r0 = std::accumulate(host_inner_prod_buffer.begin() + 3 * buffer_size_per_vector, host_inner_prod_buffer.begin() + 4 * buffer_size_per_vector, ScalarType(0));
        As_dot_r0 = std::accumulate(host_inner_prod_buffer.begin() + 4 * buffer_size_per_vector, host_inner_prod_buffer.begin() + 5 * buffer_size_per_vector, ScalarType(0));
         s_dot_s  = std::accumulate(host_inner_prod_buffer.begin() + 5 * buffer_size_per_vector, host_inner_prod_buffer.begin() + 6 * buffer_size_per_vector, ScalarType(0));

        alpha =         r_dot_r0 / Ap_dot_r0;
        beta  = -1.0 * As_dot_r0 / Ap_dot_r0;
        omega =        As_dot_s  / As_dot_As;

        residual_norm = std::sqrt(s_dot_s - 2.0 * omega * As_dot_s + omega * omega *  As_dot_As);
        if (std::fabs(residual_norm / norm_rhs_host) < tag.tolerance())
          break;

        // x_{j+1} = x_j + alpha * p_j + omega * s_j
        // r_{j+1} = s_j - omega * t_j
        // p_{j+1} = r_{j+1} + beta * (p_j - omega * q_j)
        // and compute first stage of r_dot_r0 = <r_{j+1}, r_o^*> for use in next iteration
         viennacl::linalg::pipelined_bicgstab_vector_update(result, alpha, p, omega, s,
                                                            residual, As,
                                                            beta, Ap,
                                                            r0star, inner_prod_buffer, buffer_size_per_vector);
      }

      //store last error estimate:
      tag.error(residual_norm / norm_rhs_host);

      return result;
    }
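For comparison, the public-facing way to run this solver is via viennacl::linalg::solve with a bicgstab_tag; a minimal hedged sketch (A, rhs, and ScalarType are assumed to be set up elsewhere; the tolerance and iteration limit are illustrative):

  viennacl::linalg::bicgstab_tag tag(1e-8, 400);                             // relative tolerance, max. iterations
  viennacl::vector<ScalarType> x = viennacl::linalg::solve(A, rhs, tag);     // unpreconditioned BiCGStab
  std::cout << "Iterations: " << tag.iters() << ", estimated error: " << tag.error() << std::endl;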
Example #24
 static size_t size(const viennacl::vector<ScalarType, A> & lhs,
                    const RHS & rhs) { return lhs.size(); }
 static size_t size2(viennacl::vector<ScalarType, A1> & lhs,
                     viennacl::vector<ScalarType, A2> & rhs) { return rhs.size2(); }
Example #26
viennacl::vector<unsigned int> bucket_select(int N, const viennacl::vector<basic_type>& in)
{
	viennacl::vector<unsigned int> src(in.size(), viennacl::traits::context(in));
	viennacl::vector<unsigned int> dst(in.size() , viennacl::traits::context(in));

	// load kernels
	static bool init = false;
	static int num_groups = 1;
	static int wg_size = 128;
	

	if (!init)
	{
		FILE * tmp = fopen("bucket_select.cl", "rb");
		fseek(tmp, 0, SEEK_END);
		std::vector<char> binary;
		binary.resize(ftell(tmp));
		rewind(tmp);
		fread(&binary[0], binary.size(), 1, tmp);
		fclose(tmp);
		binary.push_back(0);
		static viennacl::context g_context = viennacl::ocl::current_context();
		viennacl::ocl::context* ctx = g_context.opencl_pcontext();
		std::cout << "Device " << ctx->current_device().name() << std::endl;
		ctx->build_options("-cl-std=CL2.0 -D CL_VERSION_2_0");
		std::string program_text(&binary[0]);
		ctx->add_program(program_text, std::string("test"));
		init = true;
	}


	viennacl::ocl::kernel scan_kernel = viennacl::ocl::current_context().get_kernel("test", "scan_buckets");
	viennacl::ocl::kernel scatter_kernel = viennacl::ocl::current_context().get_kernel("test", "scatter_buckets");
	viennacl::ocl::kernel init_offsets_kernel = viennacl::ocl::current_context().get_kernel("test", "init_offsets");

	scan_kernel.local_work_size(0, wg_size);
	scan_kernel.global_work_size(0, wg_size * num_groups);

	scatter_kernel.local_work_size(0, wg_size);
	scatter_kernel.global_work_size(0, wg_size * num_groups);

	init_offsets_kernel.local_work_size(0, wg_size);
	init_offsets_kernel.global_work_size(0, wg_size* num_groups);
	cl_uint size = src.size();
	viennacl::ocl::enqueue(init_offsets_kernel(size, src));

	int position = 0;
	viennacl::vector<unsigned int> result(N, viennacl::traits::context(in));

	int num_buckets = 10;
	viennacl::vector<unsigned int> global_histogram((num_buckets + 1) * num_groups, viennacl::traits::context(in)); // -wg_size
	viennacl::vector<unsigned int> global_histogram_prefix((num_buckets + 1) * num_groups  + 1, viennacl::traits::context(in));
	std::vector< unsigned int > global_histogram_cpu((num_buckets + 1) * num_groups + 1);
	int scan_start = 0;
	int scan_end = in.size();
	basic_type pivot;
	basic_type base_value;
	int split_bucket = 0;
	base_value = 0;
	pivot = std::numeric_limits<basic_type>::max() / num_buckets;
	assert(pivot > 0);
	while (position < N)
	{
		int main = (scan_end / wg_size) * wg_size; // floor to multiple wg size
		int loop_end = main == scan_end ? main : main + wg_size; // add wg size if needed

		viennacl::ocl::enqueue(scan_kernel(in,
			src,
			scan_end,
			loop_end,
			viennacl::ocl::local_mem(sizeof(cl_uint) *wg_size),
			base_value,
			pivot,
			num_buckets,
			global_histogram));

		viennacl::linalg::exclusive_scan(global_histogram, global_histogram_prefix);
		viennacl::copy(global_histogram_prefix, global_histogram_cpu);
		global_histogram_cpu[global_histogram_cpu.size() - 1] = global_histogram_cpu[global_histogram_cpu.size() - 2]; // fix last element

		for (split_bucket = 1; split_bucket < num_buckets; ++split_bucket)
		{
			int offset = global_histogram_cpu[num_groups * split_bucket];
			if (offset >= N)
				break;
		}
		viennacl::ocl::enqueue(scatter_kernel(
			in,
			src,
			scan_end,
			loop_end,
			viennacl::ocl::local_mem(sizeof(cl_uint) *wg_size),
			(basic_type)base_value,
			(basic_type)pivot,
			num_buckets,
			split_bucket,
			global_histogram_prefix,
			dst
			));

		int hist_max = global_histogram_cpu[num_groups * split_bucket];
		int hist_min = global_histogram_cpu[num_groups * (split_bucket - 1)];
		//#ifdef DEBUG_RADIX_SELECT
		std::vector<unsigned int> dst_cpu(in.size());
		std::vector<unsigned int> src_cpu(in.size());
		viennacl::copy(dst, dst_cpu);
		viennacl::copy(src, src_cpu);
		//#endif

		if (hist_max == N)
			break;
		if (hist_max> N && hist_min < N)
		{
			scan_start = global_histogram_cpu[num_groups * (split_bucket - 1)];
			scan_end = global_histogram_cpu[num_groups * split_bucket];
			if (scan_start > 0)
			{
				viennacl::copy(dst.begin(), dst.begin() + scan_start, result.begin() + position);
				position += scan_start;
			}
			//#ifdef DEBUG_RADIX_SELECT
			std::vector<unsigned int> result_cpu(in.size());
			viennacl::copy(result, result_cpu);
			//#endif
			if (position >= N)
				break;
			if (scan_end == dst.size() && scan_start == 0)
				dst.fast_swap(src);
			else
				viennacl::copy(dst.begin() + scan_start, dst.begin() + scan_end, src.begin());
			scan_end -= scan_start;
		}

		base_value += pivot * (split_bucket-1);
		// update pivot
		
		pivot = pivot / num_buckets;
		if (pivot == 0)
			break;


	}
	if (position <N)
		viennacl::copy(dst.begin(), dst.begin() + (N - position), result.begin() + position);

	return result;
}