// Runs the single-device benchmark pipeline on the three input vectors and
// returns the raw bytes read back from the result buffer.
//
// The inputs are uploaded to hpxcl_single_buffer_a/b/c, then five kernels are
// chained through event dependencies:
//     exp  <- write(b)
//     add  <- exp, write(a)
//     dbl  <- write(c)
//     mul  <- add, dbl
//     log  <- mul
// The output is read from hpxcl_single_buffer_z once the log kernel is done.
//
// Timing outputs (all three are results of timer_stop() following a single
// timer_start(); presumably the elapsed time since the start -- confirm
// against the timer helper):
//     *t_nonblock  taken right after the last kernel has been enqueued
//     *t_sync      taken after the log kernel's event future was resolved
//     *t_finish    taken after the log kernel's event has been awaited
// The uploads are completed before the timer starts and the read-back is
// issued after the last measurement, so neither is part of the timings.
//
// If the three vectors do not all have the same length, an empty shared_ptr
// is returned and the timing outputs are left untouched.
static boost::shared_ptr<std::vector<char>> hpxcl_single_calculate(std::vector<float> &a,
                                                                   std::vector<float> &b,
                                                                   std::vector<float> &c,
                                                                   double* t_nonblock,
                                                                   double* t_sync,
                                                                   double* t_finish)
{
    typedef boost::shared_ptr<std::vector<char>> result_ptr;

    // Guard: all three inputs must have the same number of elements.
    const bool same_length = (a.size() == b.size()) && (b.size() == c.size());
    if (!same_length)
    {
        return result_ptr();
    }

    const size_t num_elements = a.size();
    const size_t num_bytes = num_elements * sizeof(float);

    // Upload the inputs to their buffers.
    shared_future<event> upload_a = hpxcl_single_buffer_a.enqueue_write(0, num_bytes, a.data());
    shared_future<event> upload_b = hpxcl_single_buffer_b.enqueue_write(0, num_bytes, b.data());
    shared_future<event> upload_c = hpxcl_single_buffer_c.enqueue_write(0, num_bytes, c.data());

    // Block until every upload is done so that the transfers are not timed.
    upload_a.get().await();
    upload_b.get().await();
    upload_c.get().await();

    // ---- timed section begins ----
    timer_start();

    // One-dimensional work range covering every element.
    work_size<1> dim;
    dim[0].offset = 0;
    dim[0].size = num_elements;

    // exp: depends on the upload of b.
    shared_future<event> exp_done = hpxcl_single_exp_kernel.enqueue(dim, upload_b);

    // add: depends on exp and on the upload of a.
    std::vector<shared_future<event>> add_deps;
    add_deps.push_back(exp_done);
    add_deps.push_back(upload_a);
    shared_future<event> add_done = hpxcl_single_add_kernel.enqueue(dim, add_deps);

    // dbl: depends on the upload of c.
    shared_future<event> dbl_done = hpxcl_single_dbl_kernel.enqueue(dim, upload_c);

    // mul: depends on add and dbl.
    std::vector<shared_future<event>> mul_deps;
    mul_deps.push_back(add_done);
    mul_deps.push_back(dbl_done);
    shared_future<event> mul_done = hpxcl_single_mul_kernel.enqueue(dim, mul_deps);

    // log: depends on mul; it is the last kernel of the chain.
    shared_future<event> log_done_future = hpxcl_single_log_kernel.enqueue(dim, mul_done);

    // Nothing above waited on a future; record the cost of the enqueue calls.
    *t_nonblock = timer_stop();

    // Resolve the future of the final kernel to obtain its event.
    event log_done = log_done_future.get();
    *t_sync = timer_stop();

    // Wait for the final kernel itself to complete.
    log_done.await();
    *t_finish = timer_stop();

    // ---- timed section ends; fetch the result ----
    shared_future<event> readback_future =
        hpxcl_single_buffer_z.enqueue_read(0, num_bytes, log_done);

    // First obtain the read event, then wait for its data to arrive.
    event readback = readback_future.get();
    return readback.get_data().get();
}