inline void test_fill(T v1, T v2, T v3, bc::command_queue queue) {
    if(boost::is_same<typename bc::scalar_type<T>::type, bc::double_>::value &&
       !queue.get_device().supports_extension("cl_khr_fp64")) {
        std::cerr << "Skipping test_fill<" << bc::type_name<T>() << ">() "
                     "on device which doesn't support cl_khr_fp64" << std::endl;
        return;
    }

    bc::vector<T> vector(4, queue.get_context());
    bc::fill(vector.begin(), vector.end(), v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    vector.resize(1000, queue);
    bc::fill(vector.begin(), vector.end(), v2, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector.back(), v2);

    bc::fill(vector.begin() + 500, vector.end(), v3, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector[499], v2);
    BOOST_CHECK_EQUAL(vector[500], v3);
    BOOST_CHECK_EQUAL(vector.back(), v3);
}
Exemple #2
0
				static decltype(auto) call(
						std::vector<neu::layer::any_layer>& layers,
						int batch_size,
						InputRange const& initial_input, OutputRange& result_output,
						boost::compute::command_queue& queue) {
					gpu_vector input(initial_input.begin(), initial_input.end(), queue);
					gpu_vector output(queue.get_context());
					int i = 0;
					for(auto& l : layers) {
						output.resize(::neu::layer::output_dim(l)*batch_size, queue);
						/*
						std::cout << "whole" << ::neu::layer::whole_output_size(l) << std::endl;
						std::cout << "i" << i << std::endl;
						std::cout << "aa" << output.size() << std::endl;
						*/
						auto output_range = range::to_range(output);
#ifdef NEU_BENCHMARK_ENABLE
						boost::timer t;
#endif //NEU_BENCHMARK_ENABLE
						l.test_forward(batch_size,
							range::to_range(input), output_range, queue);
#ifdef NEU_BENCHMARK_ENABLE
						queue.finish();
						std::cout << "layer" << i << "\ttest_forward\t" << t.elapsed() << " secs" << std::endl;
#endif //NEU_BENCHMARK_ENABLE
						input.swap(output);
						++i;
					}
					range::copy(input, result_output, queue);
				}
Exemple #3
0
				static decltype(auto) call(std::vector<neu::layer::any_layer>& layers,
						InputRange const& initial_delta,
						OutputRange& result_prev_delta,
						boost::compute::command_queue& queue) {
					gpu_vector delta(initial_delta.begin(), initial_delta.end(), queue);
					gpu_vector prev_delta(queue.get_context());

					for(int i = layers.size()-1; i >= 0; --i) {
						auto& l = layers.at(i);
						prev_delta.resize(::neu::layer::whole_input_size(l), queue);
						auto prev_delta_range = range::to_range(prev_delta);
#ifdef NEU_BENCHMARK_ENABLE
						boost::timer t;
#endif //NEU_BENCHMARK_ENABLE
						l.backward(
							range::to_range(delta), prev_delta_range,
							queue);
#ifdef NEU_BENCHMARK_ENABLE
						queue.finish();
						std::cout << "layer" << i << "\tbackward\t" << t.elapsed() << " secs" << std::endl;
#endif //NEU_BENCHMARK_ENABLE
						delta.swap(prev_delta);
					}
					range::copy(delta, result_prev_delta, queue);
				}
void perf_random_number_engine(const size_t size,
                               const size_t trials,
                               compute::command_queue& queue)
{
    typedef typename Engine::result_type T;

    // create random number engine
    Engine engine(queue);

    // create vector on the device
    std::cout << "size = " << size << std::endl;
    compute::vector<T> vector(size, queue.get_context());

    // generate random numbers
    perf_timer t;
    for(size_t i = 0; i < trials; i++){
        t.start();
        engine.generate(vector.begin(), vector.end(), queue);
        queue.finish();
        t.stop();
    }

    // print result
    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
    std::cout << "rate: " << perf_rate<T>(size, t.min_time()) << " MB/s" << std::endl;
}
Exemple #5
0
inline void test_fill(T v1, T v2, T v3, bc::command_queue queue) {
    bc::vector<T> vector(4, queue.get_context());
    bc::fill(vector.begin(), vector.end(), v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    vector.resize(1000, queue);
    bc::fill(vector.begin(), vector.end(), v2, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector.back(), v2);

    bc::fill(vector.begin() + 500, vector.end(), v3, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector[499], v2);
    BOOST_CHECK_EQUAL(vector[500], v3);
    BOOST_CHECK_EQUAL(vector.back(), v3);
}
Exemple #6
0
    void saxpy(const int num, bool gen = true, int iter = 0)
    {
        static compute::device gpu;
        static compute::context context;
        static compute::command_queue queue;
        static compute::vector<T> x;
        static compute::vector<T> y;
        static compute::vector<T> res;
        static T alpha = 3.5;

        using compute::lambda::_1;
        using compute::lambda::_2;

        if (gen) {
            gpu = compute::system::default_device();
            context = compute::context(gpu);
            queue = compute::command_queue(context, gpu);

            x = compute::vector<T>(num, context);
            std::vector<T> h_x(num);
            std::generate(h_x.begin(), h_x.end(), rand);
            compute::copy(h_x.begin(), h_x.end(), x.begin(), queue);

            y = compute::vector<T>(num, context);
            std::vector<T> h_y(num);
            std::generate(h_y.begin(), h_y.end(), rand);
            compute::copy(h_y.begin(), h_y.end(), y.begin(), queue);

            res = compute::vector<T>(num, context);
            queue.finish();
        }

        for (int i = 0; i < iter; i++) {
            compute::transform(x.begin(), x.end(),
                               y.begin(), res.begin(),
                               alpha * _1 + _2,
                               queue);
        }

        queue.finish();
    }
inline void test_fill_n(T v1, T v2, T v3, bc::command_queue queue) {
    if(boost::is_same<typename bc::scalar_type<T>::type, bc::double_>::value &&
       !queue.get_device().supports_extension("cl_khr_fp64")) {
        std::cerr << "Skipping test_fill_n<" << bc::type_name<T>() << ">() "
                     "on device which doesn't support cl_khr_fp64" << std::endl;
        return;
    }

    bc::vector<T> vector(4, queue.get_context());
    bc::fill_n(vector.begin(), 4, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    bc::fill_n(vector.begin(), 3, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v1));

    bc::fill_n(vector.begin() + 1, 2, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v3, v3, v1));

    bc::fill_n(vector.begin(), 4, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v2));

    // fill last element
    bc::fill_n(vector.end() - 1, 1, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v3));

    // fill first element
    bc::fill_n(vector.begin(), 1, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v2, v2, v3));
}
Exemple #8
0
inline void test_fill_n(T v1, T v2, T v3, bc::command_queue queue) {
    bc::vector<T> vector(4, queue.get_context());
    bc::fill_n(vector.begin(), 4, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    bc::fill_n(vector.begin(), 3, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v1));

    bc::fill_n(vector.begin() + 1, 2, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v3, v3, v1));

    bc::fill_n(vector.begin(), 4, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v2));

    // fill last element
    bc::fill_n(vector.end() - 1, 1, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v3));

    // fill first element
    bc::fill_n(vector.begin(), 1, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v2, v2, v3));
}
double perf_accumulate(const compute::vector<T>& data,
                       const size_t trials,
                       compute::command_queue& queue)
{
    perf_timer t;
    for(size_t trial = 0; trial < trials; trial++){
        t.start();
        compute::accumulate(data.begin(), data.end(), T(0), queue);
        queue.finish();
        t.stop();
    }
    return t.min_time();
}
Exemple #10
0
				static decltype(auto) call(std::vector<neu::layer::any_layer>& layers,
						boost::compute::command_queue& queue) {
					int i = 0;
					for(auto& l : layers) {
#ifdef NEU_BENCHMARK_ENABLE
						boost::timer t;
#endif //NEU_BENCHMARK_ENABLE
						l.update(queue);
#ifdef NEU_BENCHMARK_ENABLE
						queue.finish();
						std::cout << "layer" << i << "\tupdate\t" << t.elapsed() << " secs" << std::endl;
#endif //NEU_BENCHMARK_ENABLE
						++i;
					}
				}