Esempio n. 1
0
inline void test_fill(T v1, T v2, T v3, bc::command_queue queue) {
    if(boost::is_same<typename bc::scalar_type<T>::type, bc::double_>::value &&
       !queue.get_device().supports_extension("cl_khr_fp64")) {
        std::cerr << "Skipping test_fill<" << bc::type_name<T>() << ">() "
                     "on device which doesn't support cl_khr_fp64" << std::endl;
        return;
    }

    bc::vector<T> vector(4, queue.get_context());
    bc::fill(vector.begin(), vector.end(), v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    vector.resize(1000, queue);
    bc::fill(vector.begin(), vector.end(), v2, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector.back(), v2);

    bc::fill(vector.begin() + 500, vector.end(), v3, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector[499], v2);
    BOOST_CHECK_EQUAL(vector[500], v3);
    BOOST_CHECK_EQUAL(vector.back(), v3);
}
Esempio n. 2
0
				static decltype(auto) call(std::vector<neu::layer::any_layer>& layers,
						InputRange const& initial_delta,
						OutputRange& result_prev_delta,
						boost::compute::command_queue& queue) {
					gpu_vector delta(initial_delta.begin(), initial_delta.end(), queue);
					gpu_vector prev_delta(queue.get_context());

					for(int i = layers.size()-1; i >= 0; --i) {
						auto& l = layers.at(i);
						prev_delta.resize(::neu::layer::whole_input_size(l), queue);
						auto prev_delta_range = range::to_range(prev_delta);
#ifdef NEU_BENCHMARK_ENABLE
						boost::timer t;
#endif //NEU_BENCHMARK_ENABLE
						l.backward(
							range::to_range(delta), prev_delta_range,
							queue);
#ifdef NEU_BENCHMARK_ENABLE
						queue.finish();
						std::cout << "layer" << i << "\tbackward\t" << t.elapsed() << " secs" << std::endl;
#endif //NEU_BENCHMARK_ENABLE
						delta.swap(prev_delta);
					}
					range::copy(delta, result_prev_delta, queue);
				}
Esempio n. 3
0
			decltype(auto) matrix_transpose(
					InputRange const& input, OutputRange& output,
					int row_size, int col_size,
					boost::compute::command_queue& queue) {
				NEU_ASSERT(row_size*col_size == range::distance(input));
				static auto transpose_kernel =
					neu::make_kernel(neu::layer::impl::matrix_transpose_kernel_source,
					"matrix_transpose", queue.get_context());
				transpose_kernel.set_args(
					range::get_buffer(input),
					static_cast<cl_int>(range::get_begin_index(input)),
					range::get_buffer(output),
					static_cast<cl_int>(range::get_begin_index(output)),
					static_cast<cl_int>(row_size),
					static_cast<cl_int>(col_size));
				std::size_t global[2] = {
					static_cast<std::size_t>(((col_size-1)/32+1)*32),
					static_cast<std::size_t>(((row_size-1)/32+1)*32)
				};
				std::size_t local[2] = {
					static_cast<std::size_t>(32),
					static_cast<std::size_t>(32)
				};
				queue.enqueue_nd_range_kernel(transpose_kernel, 2, nullptr, global, local);
			}
void perf_random_number_engine(const size_t size,
                               const size_t trials,
                               compute::command_queue& queue)
{
    typedef typename Engine::result_type T;

    // create random number engine
    Engine engine(queue);

    // create vector on the device
    std::cout << "size = " << size << std::endl;
    compute::vector<T> vector(size, queue.get_context());

    // generate random numbers
    perf_timer t;
    for(size_t i = 0; i < trials; i++){
        t.start();
        engine.generate(vector.begin(), vector.end(), queue);
        queue.finish();
        t.stop();
    }

    // print result
    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
    std::cout << "rate: " << perf_rate<T>(size, t.min_time()) << " MB/s" << std::endl;
}
Esempio n. 5
0
				static decltype(auto) call(
						std::vector<neu::layer::any_layer>& layers,
						int batch_size,
						InputRange const& initial_input, OutputRange& result_output,
						boost::compute::command_queue& queue) {
					gpu_vector input(initial_input.begin(), initial_input.end(), queue);
					gpu_vector output(queue.get_context());
					int i = 0;
					for(auto& l : layers) {
						output.resize(::neu::layer::output_dim(l)*batch_size, queue);
						/*
						std::cout << "whole" << ::neu::layer::whole_output_size(l) << std::endl;
						std::cout << "i" << i << std::endl;
						std::cout << "aa" << output.size() << std::endl;
						*/
						auto output_range = range::to_range(output);
#ifdef NEU_BENCHMARK_ENABLE
						boost::timer t;
#endif //NEU_BENCHMARK_ENABLE
						l.test_forward(batch_size,
							range::to_range(input), output_range, queue);
#ifdef NEU_BENCHMARK_ENABLE
						queue.finish();
						std::cout << "layer" << i << "\ttest_forward\t" << t.elapsed() << " secs" << std::endl;
#endif //NEU_BENCHMARK_ENABLE
						input.swap(output);
						++i;
					}
					range::copy(input, result_output, queue);
				}
Esempio n. 6
0
inline void test_fill_n(T v1, T v2, T v3, bc::command_queue queue) {
    if(boost::is_same<typename bc::scalar_type<T>::type, bc::double_>::value &&
       !queue.get_device().supports_extension("cl_khr_fp64")) {
        std::cerr << "Skipping test_fill_n<" << bc::type_name<T>() << ">() "
                     "on device which doesn't support cl_khr_fp64" << std::endl;
        return;
    }

    bc::vector<T> vector(4, queue.get_context());
    bc::fill_n(vector.begin(), 4, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    bc::fill_n(vector.begin(), 3, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v1));

    bc::fill_n(vector.begin() + 1, 2, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v3, v3, v1));

    bc::fill_n(vector.begin(), 4, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v2));

    // fill last element
    bc::fill_n(vector.end() - 1, 1, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v3));

    // fill first element
    bc::fill_n(vector.begin(), 1, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v2, v2, v3));
}
Esempio n. 7
0
 void read(boost::compute::command_queue q, size_t offset,
         size_t size, T *host, bool blocking = false
         ) const
 {
     if (size) {
         if (blocking) {
             q.enqueue_read_buffer(
                     buffer, sizeof(T) * offset, sizeof(T) * size, host
                     );
         } else {
             q.enqueue_read_buffer_async(
                     buffer, sizeof(T) * offset, sizeof(T) * size, host
                     );
         }
     }
 }
Esempio n. 8
0
inline void test_fill_n(T v1, T v2, T v3, bc::command_queue queue) {
    bc::vector<T> vector(4, queue.get_context());
    bc::fill_n(vector.begin(), 4, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    bc::fill_n(vector.begin(), 3, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v1));

    bc::fill_n(vector.begin() + 1, 2, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v3, v3, v1));

    bc::fill_n(vector.begin(), 4, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v2));

    // fill last element
    bc::fill_n(vector.end() - 1, 1, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v3));

    // fill first element
    bc::fill_n(vector.begin(), 1, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v2, v2, v3));
}
Esempio n. 9
0
inline void test_fill(T v1, T v2, T v3, bc::command_queue queue) {
    bc::vector<T> vector(4, queue.get_context());
    bc::fill(vector.begin(), vector.end(), v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    vector.resize(1000, queue);
    bc::fill(vector.begin(), vector.end(), v2, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector.back(), v2);

    bc::fill(vector.begin() + 500, vector.end(), v3, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector[499], v2);
    BOOST_CHECK_EQUAL(vector[500], v3);
    BOOST_CHECK_EQUAL(vector.back(), v3);
}
Esempio n. 10
0
        device_vector(const boost::compute::command_queue &q, size_t n,
                const T *host = 0, mem_flags flags = MEM_READ_WRITE)
        {
            if (host && !(flags & CL_MEM_USE_HOST_PTR))
                flags |= CL_MEM_COPY_HOST_PTR;

            if (n)
                buffer = boost::compute::buffer(q.get_context(), n * sizeof(T),
                        flags, static_cast<void*>(const_cast<T*>(host)));
        }
    void generate(OutputIterator first, OutputIterator last, Function op, boost::compute::command_queue &queue)
    {
        boost::compute::vector<T> tmp(std::distance(first, last), queue.get_context());

        BOOST_COMPUTE_FUNCTION(T, max_random, (const T x),
        {
            if(get_global_id(0) < 1)
                return (ValueType) MAX_RANDOM;
            else
                return (ValueType) 0;
        });
Esempio n. 12
0
 mapped_array map(boost::compute::command_queue q) const {
     return mapped_array(
             static_cast<T*>(
                 q.enqueue_map_buffer(
                     buffer, CL_MAP_READ,
                     0, size() * sizeof(T)
                     )
                 ),
             buffer_unmapper(q, buffer)
             );
 }
Esempio n. 13
0
    void saxpy(const int num, bool gen = true, int iter = 0)
    {
        static compute::device gpu;
        static compute::context context;
        static compute::command_queue queue;
        static compute::vector<T> x;
        static compute::vector<T> y;
        static compute::vector<T> res;
        static T alpha = 3.5;

        using compute::lambda::_1;
        using compute::lambda::_2;

        if (gen) {
            gpu = compute::system::default_device();
            context = compute::context(gpu);
            queue = compute::command_queue(context, gpu);

            x = compute::vector<T>(num, context);
            std::vector<T> h_x(num);
            std::generate(h_x.begin(), h_x.end(), rand);
            compute::copy(h_x.begin(), h_x.end(), x.begin(), queue);

            y = compute::vector<T>(num, context);
            std::vector<T> h_y(num);
            std::generate(h_y.begin(), h_y.end(), rand);
            compute::copy(h_y.begin(), h_y.end(), y.begin(), queue);

            res = compute::vector<T>(num, context);
            queue.finish();
        }

        for (int i = 0; i < iter; i++) {
            compute::transform(x.begin(), x.end(),
                               y.begin(), res.begin(),
                               alpha * _1 + _2,
                               queue);
        }

        queue.finish();
    }
Esempio n. 14
0
double perf_accumulate(const compute::vector<T>& data,
                       const size_t trials,
                       compute::command_queue& queue)
{
    perf_timer t;
    for(size_t trial = 0; trial < trials; trial++){
        t.start();
        compute::accumulate(data.begin(), data.end(), T(0), queue);
        queue.finish();
        t.stop();
    }
    return t.min_time();
}
Esempio n. 15
0
				static decltype(auto) call(std::vector<neu::layer::any_layer>& layers,
						boost::compute::command_queue& queue) {
					int i = 0;
					for(auto& l : layers) {
#ifdef NEU_BENCHMARK_ENABLE
						boost::timer t;
#endif //NEU_BENCHMARK_ENABLE
						l.update(queue);
#ifdef NEU_BENCHMARK_ENABLE
						queue.finish();
						std::cout << "layer" << i << "\tupdate\t" << t.elapsed() << " secs" << std::endl;
#endif //NEU_BENCHMARK_ENABLE
						++i;
					}
				}
Esempio n. 16
0
void test_copy_if_odd(compute::command_queue &queue)
{
    // create input and output vectors on the device
    const compute::context &context = queue.get_context();
    compute::vector<int> input(PERF_N, context);
    compute::vector<int> output(PERF_N, context);

    // generate random numbers between 1 and 10
    compute::default_random_engine rng(queue);
    compute::uniform_int_distribution<int> d(1, 10);
    d.generate(input.begin(), input.end(), rng, queue);

    BOOST_COMPUTE_FUNCTION(bool, is_odd, (int x),
    {
        return x & 1;
    });
Esempio n. 17
0
// tesselates a sphere with radius, phi_slices, and theta_slices. returns
// a shared opencl/opengl buffer containing the vertex data.
compute::opengl_buffer tesselate_sphere(float radius,
                                        size_t phi_slices,
                                        size_t theta_slices,
                                        compute::command_queue &queue)
{
    using compute::dim;

    const compute::context &context = queue.get_context();

    const size_t vertex_count = phi_slices * theta_slices;

    // create opengl buffer
    GLuint vbo;
    vtkgl::GenBuffersARB(1, &vbo);
    vtkgl::BindBufferARB(vtkgl::ARRAY_BUFFER, vbo);
    vtkgl::BufferDataARB(vtkgl::ARRAY_BUFFER,
                         sizeof(float) * 4 * vertex_count,
                         NULL,
                         vtkgl::STREAM_DRAW);
    vtkgl::BindBufferARB(vtkgl::ARRAY_BUFFER, 0);

    // create shared opengl/opencl buffer
    compute::opengl_buffer vertex_buffer(context, vbo);

    // tesselate_sphere kernel source
    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
        __kernel void tesselate_sphere(float radius,
                                       uint phi_slices,
                                       uint theta_slices,
                                       __global float4 *vertex_buffer)
        {
            const uint phi_i = get_global_id(0);
            const uint theta_i = get_global_id(1);

            const float phi = phi_i * 2.f * M_PI_F / phi_slices;
            const float theta = theta_i * 2.f * M_PI_F / theta_slices;

            float4 v;
            v.x = radius * cos(theta) * cos(phi);
            v.y = radius * cos(theta) * sin(phi);
            v.z = radius * sin(theta);
            v.w = 1.f;

            vertex_buffer[phi_i*phi_slices+theta_i] = v;
        }
    );
Esempio n. 18
0
/**
 * If VEXCL_CACHE_KERNELS macro is defined, then program binaries are cached
 * in filesystem and reused in the following runs.
 */
inline boost::compute::program build_sources(
    const boost::compute::command_queue &queue,
    const std::string &source,
    const std::string &options = ""
)
{
#ifdef VEXCL_SHOW_KERNELS
    std::cout << source << std::endl;
#else
    if (getenv("VEXCL_SHOW_KERNELS"))
        std::cout << source << std::endl;
#endif

    return boost::compute::program::build_with_source(
               source, queue.get_context(),
               options + " " + get_compile_options(queue)
           );
}
Esempio n. 19
0
inline void box_filter_image(const compute::image2d &input,
                             compute::image2d &output,
                             compute::uint_ box_height,
                             compute::uint_ box_width,
                             compute::command_queue &queue)
{
    using compute::dim;

    const compute::context &context = queue.get_context();

    // simple box filter kernel source
    const char source[] = BOOST_COMPUTE_STRINGIZE_SOURCE(
        __kernel void box_filter(__read_only image2d_t input,
                                 __write_only image2d_t output,
                                 uint box_height,
                                 uint box_width)
        {
            int x = get_global_id(0);
            int y = get_global_id(1);
            int h = get_image_height(input);
            int w = get_image_width(input);
            int k = box_width;
            int l = box_height;

            if(x < k/2 || y < l/2 || x >= w-(k/2) || y >= h-(l/2)){
                write_imagef(output, (int2)(x, y), (float4)(0, 0, 0, 1));
            }
            else {
                const sampler_t sampler = CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

                float4 sum = { 0, 0, 0, 0 };
                for(int i = 0; i < k; i++){
                    for(int j = 0; j < l; j++){
                        sum += read_imagef(input, sampler, (int2)(x+i-k, y+j-l));
                    }
                }
                sum /= (float) k * l;
                float4 value = (float4)( sum.x, sum.y, sum.z, 1.f );
                write_imagef(output, (int2)(x, y), value);
            }
        }
Esempio n. 20
0
        /// Select best launch configuration for the given shared memory requirements.
        void config(const boost::compute::command_queue &queue, std::function<size_t(size_t)> smem) {
            boost::compute::device dev = queue.get_device();

            size_t ws;

            if ( is_cpu(queue) ) {
                ws = 1;
            } else {
                // Select workgroup size that would fit into the device.
                ws = dev.get_info<std::vector<size_t>>(CL_DEVICE_MAX_WORK_ITEM_SIZES)[0] / 2;

                size_t max_ws   = max_threads_per_block(queue);
                size_t max_smem = max_shared_memory_per_block(queue);

                // Reduce workgroup size until it satisfies resource requirements:
                while( (ws > max_ws) || (smem(ws) > max_smem) )
                    ws /= 2;
            }

            config(num_workgroups(queue), ws);
        }
Esempio n. 21
0
void tune_accumulate(const compute::vector<T>& data,
                     const size_t trials,
                     compute::command_queue& queue)
{
    boost::shared_ptr<compute::detail::parameter_cache>
        params = compute::detail::parameter_cache::get_global_cache(queue.get_device());

    const std::string cache_key =
        std::string("__boost_reduce_on_gpu_") + compute::type_name<T>();

    const compute::uint_ tpbs[] = { 4, 8, 16, 32, 64, 128, 256, 512, 1024 };
    const compute::uint_ vpts[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };

    double min_time = std::numeric_limits<double>::max();
    compute::uint_ best_tpb = 0;
    compute::uint_ best_vpt = 0;

    for(size_t i = 0; i < sizeof(tpbs) / sizeof(*tpbs); i++){
        params->set(cache_key, "tpb", tpbs[i]);
        for(size_t j = 0; j < sizeof(vpts) / sizeof(*vpts); j++){
            params->set(cache_key, "vpt", vpts[j]);

            try {
                const double t = perf_accumulate(data, trials, queue);
                if(t < min_time){
                    best_tpb = tpbs[i];
                    best_vpt = vpts[j];
                    min_time = t;
                }
            }
            catch(compute::opencl_error&){
                // invalid parameters for this device, skip
            }
        }
    }

    // store optimal parameters
    params->set(cache_key, "tpb", best_tpb);
    params->set(cache_key, "vpt", best_vpt);
}
Esempio n. 22
0
	vector(vector_expression<E, gpu_tag> const& e, boost::compute::command_queue& queue)
	: m_storage(e().size(), queue.get_context())
	, m_queue(&queue){
		assign(*this, e);
	}
Esempio n. 23
0
void medianFilter2D_wrapper(compute::command_queue queue,boost::compute::program foo_program,compute::buffer gpu_in,compute::buffer gpu_out,compute::buffer gpu_histogram,int heightImage,int widthImage,int implementation)
{
  try{
  boost::compute::kernel foo_kernel;
switch(implementation)
{
  case 1:
    std::cout<<"running naive median filter"<<std::endl;
    foo_kernel = foo_program.create_kernel("MedianFilter2D");
    break;
  case 2:
    std::cout<<"running histogram median filter"<<std::endl;
    foo_kernel = foo_program.create_kernel("MedianFilter2D_histogram");
    break;  
  case 3:
    std::cout<<"running median filter with partial selection"<<std::endl;
    foo_kernel = foo_program.create_kernel("MedianFilter2D_partial");
    break;
  case 4:
    std::cout<<"running median filter with forgetful selection"<<std::endl;
   foo_kernel = foo_program.create_kernel("MedianFilter2D_forgetful");
    break;
  case 5:
    std::cout<<"running median filter with fast histogram"<<std::endl;
   foo_kernel = foo_program.create_kernel("histogram2d");
    break;
  
}   

if(implementation!=5)
{
         // TODO these are the arguments for the first kernel
    foo_kernel.set_arg(0,gpu_in);
    foo_kernel.set_arg(1,gpu_out);
    foo_kernel.set_arg(2,sizeof(int),&widthImage);
    foo_kernel.set_arg(3,sizeof(int),&heightImage);
//     foo_kernel.set_arg(4,sizeof(unsigned int),&window_size);

        // Launch kernel

   
   const size_t offset[] = { 0, 0 };
   const size_t bounds[] = { heightImage, widthImage };
    timer kernel_timer1;
    queue.enqueue_nd_range_kernel(foo_kernel, 2, 
                              offset, 
                              bounds, 
                              0);
        double time_elapsed1=kernel_timer1.elapsed();
    printf("total time elapsed for the kernel implementation %d is %f \n",implementation,time_elapsed1);
}
else
{
      foo_kernel.set_arg(0,gpu_in);
    foo_kernel.set_arg(1,gpu_out);
    foo_kernel.set_arg(2,gpu_histogram);
    foo_kernel.set_arg(3,sizeof(int),&widthImage);
    foo_kernel.set_arg(4,sizeof(int),&heightImage);
//     foo_kernel.set_arg(4,sizeof(unsigned int),&window_size);

        // Launch kernel 
    timer kernel_timer;
    queue.enqueue_1d_range_kernel(foo_kernel, 0,heightImage, 0);
    double time_elapsed=kernel_timer.elapsed();
    printf("total time elapsed for the kernel implementation %d is %f \n",implementation,time_elapsed);
}
  }
  catch(boost::compute::opencl_error &e){
  
    std::cout<<"something went wrong with kernel execution"<<std::endl;
  }
}
Esempio n. 24
0
 size_t preferred_work_group_size_multiple(const boost::compute::command_queue &q) const {
     return K.get_work_group_info<size_t>(q.get_device(), CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE);
 }
Esempio n. 25
0
        /// The size in bytes of shared memory per block available for this kernel.
        size_t max_shared_memory_per_block(const boost::compute::command_queue &q) const {
            boost::compute::device d = q.get_device();

            return d.local_memory_size() - K.get_work_group_info<cl_ulong>(d, CL_KERNEL_LOCAL_MEM_SIZE);
        }
Esempio n. 26
0
 /// The maximum number of threads per block, beyond which a launch of the kernel would fail.
 size_t max_threads_per_block(const boost::compute::command_queue &q) const {
     return K.get_work_group_info<size_t>(q.get_device(), CL_KERNEL_WORK_GROUP_SIZE);
 }
Esempio n. 27
0
 /// Standard number of workgroups to launch on a device.
 static inline size_t num_workgroups(const boost::compute::command_queue &q) {
     // This is a simple heuristic-based estimate. More advanced technique may
     // be employed later.
     return 8 * q.get_device().compute_units();
 }
Esempio n. 28
0
	/// \brief Constructor of a vector with a default queue
	///
	///note that for all operations for which vector is on the left hand side,
	///the kernels are enqueued on the supplied queue in case of a multi-queue setup.
	vector(boost::compute::command_queue& queue = boost::compute::system::default_queue())
	:m_storage(queue.get_context()), m_queue(&queue){}
Esempio n. 29
0
 /// Enqueue the kernel to the specified command queue.
 void operator()(boost::compute::command_queue q) {
     q.enqueue_nd_range_kernel(K, 3, NULL, g_size.dim, w_size.dim);
     argpos = 0;
 }
Esempio n. 30
0
	/// \brief Constructor of a vector with a predefined size
	/// By default, its elements are uninitialized.
	/// \param size initial size of the vector
	/// \param queue the opencl queue to use by this vector
	explicit vector(size_type size, boost::compute::command_queue& queue = boost::compute::system::default_queue())
	: m_storage(size,queue.get_context()), m_queue(&queue){}