template<class T>
inline void test_fill(T v1, T v2, T v3, bc::command_queue queue)
{
    if(boost::is_same<typename bc::scalar_type<T>::type, bc::double_>::value &&
       !queue.get_device().supports_extension("cl_khr_fp64")) {
        std::cerr << "Skipping test_fill<" << bc::type_name<T>() << ">() "
                     "on device which doesn't support cl_khr_fp64" << std::endl;
        return;
    }

    bc::vector<T> vector(4, queue.get_context());
    bc::fill(vector.begin(), vector.end(), v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    vector.resize(1000, queue);
    bc::fill(vector.begin(), vector.end(), v2, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector.back(), v2);

    // refill only the second half
    bc::fill(vector.begin() + 500, vector.end(), v3, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector[499], v2);
    BOOST_CHECK_EQUAL(vector[500], v3);
    BOOST_CHECK_EQUAL(vector.back(), v3);
}
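// A minimal sketch of how this helper might be driven from a Boost.Test
// case. The test name and values are illustrative; it assumes the
// surrounding fixture supplies `queue` and that `bc` aliases boost::compute.
BOOST_AUTO_TEST_CASE(fill_float)
{
    test_fill(0.0f, 2.5f, 42.0f, queue);
}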
template<class InputRange, class OutputRange>
static decltype(auto) call(
    std::vector<neu::layer::any_layer>& layers,
    int batch_size,
    InputRange const& initial_input,
    OutputRange& result_output,
    boost::compute::command_queue& queue)
{
    gpu_vector input(initial_input.begin(), initial_input.end(), queue);
    gpu_vector output(queue.get_context());
    int i = 0;
    for(auto& l : layers) {
        // size the scratch buffer for this layer's output over the whole batch
        output.resize(::neu::layer::output_dim(l)*batch_size, queue);
        auto output_range = range::to_range(output);
#ifdef NEU_BENCHMARK_ENABLE
        boost::timer t;
#endif //NEU_BENCHMARK_ENABLE
        l.test_forward(batch_size, range::to_range(input), output_range, queue);
#ifdef NEU_BENCHMARK_ENABLE
        queue.finish();
        std::cout << "layer" << i << "\ttest_forward\t"
            << t.elapsed() << " secs" << std::endl;
#endif //NEU_BENCHMARK_ENABLE
        // ping-pong: this layer's output becomes the next layer's input
        input.swap(output);
        ++i;
    }
    range::copy(input, result_output, queue);
}
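// A hypothetical call site for the forward helper above. `forward_t` stands
// in for whichever struct encloses `call` (not shown in this excerpt), the
// float buffers assume the network's scalar type is float, and the batch
// size is illustrative.
void forward_example(std::vector<neu::layer::any_layer>& layers,
                     std::vector<float> const& input,
                     boost::compute::command_queue& queue)
{
    const int batch_size = 32;
    std::vector<float> output(
        ::neu::layer::output_dim(layers.back()) * batch_size);
    forward_t::call(layers, batch_size, input, output, queue);
}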
template<class InputRange, class OutputRange>
static decltype(auto) call(
    std::vector<neu::layer::any_layer>& layers,
    InputRange const& initial_delta,
    OutputRange& result_prev_delta,
    boost::compute::command_queue& queue)
{
    gpu_vector delta(initial_delta.begin(), initial_delta.end(), queue);
    gpu_vector prev_delta(queue.get_context());
    // walk the layers in reverse, propagating the delta backwards
    for(int i = static_cast<int>(layers.size()) - 1; i >= 0; --i) {
        auto& l = layers.at(i);
        prev_delta.resize(::neu::layer::whole_input_size(l), queue);
        auto prev_delta_range = range::to_range(prev_delta);
#ifdef NEU_BENCHMARK_ENABLE
        boost::timer t;
#endif //NEU_BENCHMARK_ENABLE
        l.backward(range::to_range(delta), prev_delta_range, queue);
#ifdef NEU_BENCHMARK_ENABLE
        queue.finish();
        std::cout << "layer" << i << "\tbackward\t"
            << t.elapsed() << " secs" << std::endl;
#endif //NEU_BENCHMARK_ENABLE
        delta.swap(prev_delta);
    }
    range::copy(delta, result_prev_delta, queue);
}
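// Both passes rely on the same ping-pong idiom: write into a scratch buffer,
// then swap handles so the next layer reads what was just produced. A minimal
// standalone sketch with plain Boost.Compute vectors; the sizes, layer count
// and the copy standing in for a real kernel are all illustrative.
inline void ping_pong_sketch(boost::compute::command_queue& queue)
{
    namespace bcl = boost::compute;
    bcl::vector<float> in(16, queue.get_context());
    bcl::vector<float> out(queue.get_context());
    for(int layer = 0; layer < 3; ++layer) {
        out.resize(in.size(), queue);                        // per-layer output size
        bcl::copy(in.begin(), in.end(), out.begin(), queue); // kernel stand-in
        in.swap(out);                       // O(1) handle swap, no device copy
    }
    queue.finish();
}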
template<class Engine>
void perf_random_number_engine(const size_t size,
                               const size_t trials,
                               compute::command_queue& queue)
{
    typedef typename Engine::result_type T;

    // create random number engine
    Engine engine(queue);

    // create vector on the device
    std::cout << "size = " << size << std::endl;
    compute::vector<T> vector(size, queue.get_context());

    // generate random numbers
    perf_timer t;
    for(size_t i = 0; i < trials; i++){
        t.start();
        engine.generate(vector.begin(), vector.end(), queue);
        queue.finish();
        t.stop();
    }

    // print result
    std::cout << "time: " << t.min_time() / 1e6 << " ms" << std::endl;
    std::cout << "rate: " << perf_rate<T>(size, t.min_time()) << " MB/s" << std::endl;
}
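// A hypothetical driver for the benchmark above, using Boost.Compute's
// mt19937 engine and the `compute` namespace alias assumed throughout;
// the buffer size and trial count are illustrative.
int main()
{
    compute::command_queue queue = compute::system::default_queue();
    perf_random_number_engine<compute::mt19937>(1 << 24, 5, queue);
    return 0;
}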
template<class T>
inline void test_fill(T v1, T v2, T v3, bc::command_queue queue)
{
    bc::vector<T> vector(4, queue.get_context());
    bc::fill(vector.begin(), vector.end(), v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    vector.resize(1000, queue);
    bc::fill(vector.begin(), vector.end(), v2, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector.back(), v2);

    // refill only the second half
    bc::fill(vector.begin() + 500, vector.end(), v3, queue);
    queue.finish();
    BOOST_CHECK_EQUAL(vector.front(), v2);
    BOOST_CHECK_EQUAL(vector[499], v2);
    BOOST_CHECK_EQUAL(vector[500], v3);
    BOOST_CHECK_EQUAL(vector.back(), v3);
}
template<typename T>
void saxpy(const int num, bool gen = true, int iter = 0)
{
    static compute::device gpu;
    static compute::context context;
    static compute::command_queue queue;
    static compute::vector<T> x;
    static compute::vector<T> y;
    static compute::vector<T> res;
    static T alpha = 3.5;

    using compute::lambda::_1;
    using compute::lambda::_2;

    if (gen) {
        // set up the device, context and queue
        gpu = compute::system::default_device();
        context = compute::context(gpu);
        queue = compute::command_queue(context, gpu);

        // fill x with random host data and copy it to the device
        x = compute::vector<T>(num, context);
        std::vector<T> h_x(num);
        std::generate(h_x.begin(), h_x.end(), rand);
        compute::copy(h_x.begin(), h_x.end(), x.begin(), queue);

        // fill y likewise
        y = compute::vector<T>(num, context);
        std::vector<T> h_y(num);
        std::generate(h_y.begin(), h_y.end(), rand);
        compute::copy(h_y.begin(), h_y.end(), y.begin(), queue);

        res = compute::vector<T>(num, context);
        queue.finish();
    }

    // res = alpha * x + y
    for (int i = 0; i < iter; i++) {
        compute::transform(x.begin(), x.end(), y.begin(), res.begin(),
                           alpha * _1 + _2, queue);
    }
    queue.finish();
}
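// Hypothetical benchmark usage: build the inputs once, then time repeated
// transforms without regenerating them; the size and iteration count are
// illustrative.
void run_saxpy_benchmark()
{
    saxpy<float>(1 << 20);             // gen = true: create x, y and res
    saxpy<float>(1 << 20, false, 100); // reuse the buffers, 100 iterations
}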
template<class T>
inline void test_fill_n(T v1, T v2, T v3, bc::command_queue queue)
{
    if(boost::is_same<typename bc::scalar_type<T>::type, bc::double_>::value &&
       !queue.get_device().supports_extension("cl_khr_fp64")) {
        std::cerr << "Skipping test_fill_n<" << bc::type_name<T>() << ">() "
                     "on device which doesn't support cl_khr_fp64" << std::endl;
        return;
    }

    bc::vector<T> vector(4, queue.get_context());
    bc::fill_n(vector.begin(), 4, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    bc::fill_n(vector.begin(), 3, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v1));

    bc::fill_n(vector.begin() + 1, 2, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v3, v3, v1));

    bc::fill_n(vector.begin(), 4, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v2));

    // fill last element
    bc::fill_n(vector.end() - 1, 1, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v3));

    // fill first element
    bc::fill_n(vector.begin(), 1, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v2, v2, v3));
}
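// A sketch of a matching Boost.Test driver; the test name and values are
// illustrative and, as above, a fixture supplying `queue` is assumed.
BOOST_AUTO_TEST_CASE(fill_n_int)
{
    test_fill_n(int(1), int(2), int(3), queue);
}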
template<class T>
inline void test_fill_n(T v1, T v2, T v3, bc::command_queue queue)
{
    bc::vector<T> vector(4, queue.get_context());
    bc::fill_n(vector.begin(), 4, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v1, v1, v1));

    bc::fill_n(vector.begin(), 3, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v1));

    bc::fill_n(vector.begin() + 1, 2, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v3, v3, v1));

    bc::fill_n(vector.begin(), 4, v2, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v2));

    // fill last element
    bc::fill_n(vector.end() - 1, 1, v3, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v2, v2, v2, v3));

    // fill first element
    bc::fill_n(vector.begin(), 1, v1, queue);
    queue.finish();
    CHECK_RANGE_EQUAL(T, 4, vector, (v1, v2, v2, v3));
}
template<class T>
double perf_accumulate(const compute::vector<T>& data,
                       const size_t trials,
                       compute::command_queue& queue)
{
    perf_timer t;
    for(size_t trial = 0; trial < trials; trial++){
        t.start();
        compute::accumulate(data.begin(), data.end(), T(0), queue);
        queue.finish();
        t.stop();
    }
    return t.min_time();
}
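// Hypothetical call, assuming perf_timer reports nanoseconds as the engine
// benchmark above suggests; the vector size and trial count are illustrative.
void run_accumulate_benchmark(compute::command_queue& queue)
{
    compute::vector<int> data(1 << 24, queue.get_context());
    compute::fill(data.begin(), data.end(), 1, queue);
    queue.finish();
    std::cout << "time: " << perf_accumulate(data, 5, queue) / 1e6
              << " ms" << std::endl;
}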
static decltype(auto) call(std::vector<neu::layer::any_layer>& layers,
                           boost::compute::command_queue& queue)
{
    int i = 0;
    for(auto& l : layers) {
#ifdef NEU_BENCHMARK_ENABLE
        boost::timer t;
#endif //NEU_BENCHMARK_ENABLE
        l.update(queue);
#ifdef NEU_BENCHMARK_ENABLE
        queue.finish();
        std::cout << "layer" << i << "\tupdate\t"
            << t.elapsed() << " secs" << std::endl;
#endif //NEU_BENCHMARK_ENABLE
        ++i;
    }
}
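// Taken together, the three `call` helpers cover a full pass: propagate
// activations forward, propagate deltas backward, then let each layer update
// its parameters. A hypothetical chaining sketch; `forward_t`, `backward_t`
// and `update_t` stand in for the enclosing structs (not shown here), float
// is an assumed scalar type, and note that the forward helper above drives
// test_forward, i.e. the inference path.
void train_step_sketch(std::vector<neu::layer::any_layer>& layers,
                       int batch_size,
                       std::vector<float> const& input,
                       std::vector<float>& delta,
                       boost::compute::command_queue& queue)
{
    std::vector<float> output(
        ::neu::layer::output_dim(layers.back()) * batch_size);
    std::vector<float> prev_delta(
        ::neu::layer::whole_input_size(layers.front()));
    forward_t::call(layers, batch_size, input, output, queue);
    // ... derive `delta` from `output` and the teacher signal here ...
    backward_t::call(layers, delta, prev_delta, queue);
    update_t::call(layers, queue);
}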