void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements) { viennacl::kernel & kernel = programs[0].program().kernel(kernel_prefix); kernel.local_work_size(0, p_.local_size_0); kernel.local_work_size(1, p_.local_size_1); kernel.global_work_size(0,p_.local_size_0*p_.num_groups_0); kernel.global_work_size(1,p_.local_size_1*p_.num_groups_1); scheduler::statement_node const & root = statements.data().front().array()[statements.data().front().root()]; unsigned int current_arg = 0; if (up_to_internal_size_) { kernel.arg(current_arg++, cl_uint(utils::call_on_matrix(root.lhs, utils::internal_size1_fun()))); kernel.arg(current_arg++, cl_uint(utils::call_on_matrix(root.lhs, utils::internal_size2_fun()))); } else { kernel.arg(current_arg++, cl_uint(utils::call_on_matrix(root.lhs, utils::size1_fun()))); kernel.arg(current_arg++, cl_uint(utils::call_on_matrix(root.lhs, utils::size2_fun()))); } set_arguments(statements, kernel, current_arg); kernel.enqueue(); }
void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type const & statements, viennacl::ocl::kernel & k, unsigned int & n_arg) const{ configure_local_sizes(k, kernel_id); k.global_work_size(0,local_size_1_*num_groups_row_); k.global_work_size(1,local_size_2_*num_groups_col_); scheduler::statement_node const & first_node = statements.front().second; k.arg(n_arg++, cl_uint(utils::call_on_matrix(first_node.lhs, utils::internal_size1_fun()))); k.arg(n_arg++, cl_uint(utils::call_on_matrix(first_node.lhs, utils::internal_size2_fun()))); }
void EventList::waitForFinished() { if(_events.empty()) return; cl_int error; if((error = clWaitForEvents(cl_uint(_events.size()), operator const cl_event*())) != CL_SUCCESS) detail::reportError("EventList::waitForFinished() ", error); }
void configure_range_enqueue_arguments(std::size_t kernel_id, statements_type const & statements, viennacl::ocl::kernel & k, unsigned int & n_arg) const{ configure_local_sizes(k, kernel_id); k.global_work_size(0,group_size_*num_groups_); k.global_work_size(1,1); scheduler::statement_node const & first_node = statements.front().second; viennacl::vcl_size_t N = utils::call_on_vector(first_node.lhs, utils::internal_size_fun()); k.arg(n_arg++, cl_uint(N/vectorization_)); }
/** * Reference CPU implementation of Simple Convolution * for performance comparison */ void SimpleConvolution::simpleConvolutionCPUReference(cl_uint *output, const cl_uint *input, const cl_float *mask, const cl_uint width, const cl_uint height, const cl_uint maskWidth, const cl_uint maskHeight) { cl_uint vstep = (maskWidth - 1) / 2; cl_uint hstep = (maskHeight - 1) / 2; // for each pixel in the input for(cl_uint x = 0; x < width; x++) for(cl_uint y = 0; y < height; y++) { /* * find the left, right, top and bottom indices such that * the indices do not go beyond image boundaires */ cl_uint left = (x < vstep) ? 0 : (x - vstep); cl_uint right = ((x + vstep) >= width) ? width - 1 : (x + vstep); cl_uint top = (y < hstep) ? 0 : (y - hstep); cl_uint bottom = ((y + hstep) >= height)? height - 1: (y + hstep); /* * initializing wighted sum value */ cl_float sumFX = 0; for(cl_uint i = left; i <= right; ++i) for(cl_uint j = top ; j <= bottom; ++j) { /* * performing wighted sum within the mask boundaries */ cl_uint maskIndex = (j - (y - hstep)) * maskWidth + (i - (x - vstep)); cl_uint index = j * width + i; /* * to round to the nearest integer */ sumFX += ((float)input[index] * mask[maskIndex]); } sumFX += 0.5f; output[y*width + x] = cl_uint(sumFX); } }
void nmf(viennacl::matrix_base<NumericT> const & V, viennacl::matrix_base<NumericT> & W, viennacl::matrix_base<NumericT> & H, viennacl::linalg::nmf_config const & conf) { viennacl::hsa::context & ctx = const_cast<viennacl::hsa::context &>(viennacl::traits::hsa_context(V)); const std::string NMF_MUL_DIV_KERNEL = "el_wise_mul_div"; viennacl::linalg::opencl::kernels::nmf<NumericT, viennacl::hsa::context>::init(ctx); vcl_size_t k = W.size2(); conf.iters_ = 0; if (viennacl::linalg::norm_frobenius(W) <= 0) W = viennacl::scalar_matrix<NumericT>(W.size1(), W.size2(), NumericT(1), ctx); if (viennacl::linalg::norm_frobenius(H) <= 0) H = viennacl::scalar_matrix<NumericT>(H.size1(), H.size2(), NumericT(1), ctx); viennacl::matrix_base<NumericT> wn(V.size1(), k, W.row_major(), ctx); viennacl::matrix_base<NumericT> wd(V.size1(), k, W.row_major(), ctx); viennacl::matrix_base<NumericT> wtmp(V.size1(), V.size2(), W.row_major(), ctx); viennacl::matrix_base<NumericT> hn(k, V.size2(), H.row_major(), ctx); viennacl::matrix_base<NumericT> hd(k, V.size2(), H.row_major(), ctx); viennacl::matrix_base<NumericT> htmp(k, k, H.row_major(), ctx); viennacl::matrix_base<NumericT> appr(V.size1(), V.size2(), V.row_major(), ctx); NumericT last_diff = 0; NumericT diff_init = 0; bool stagnation_flag = false; for (vcl_size_t i = 0; i < conf.max_iterations(); i++) { conf.iters_ = i + 1; { hn = viennacl::linalg::prod(trans(W), V); htmp = viennacl::linalg::prod(trans(W), W); hd = viennacl::linalg::prod(htmp, H); viennacl::hsa::kernel & mul_div_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::nmf<NumericT>::program_name(), NMF_MUL_DIV_KERNEL); viennacl::hsa::enqueue(mul_div_kernel(H, hn, hd, cl_uint(H.internal_size1() * H.internal_size2()))); } { wn = viennacl::linalg::prod(V, trans(H)); wtmp = viennacl::linalg::prod(W, H); wd = viennacl::linalg::prod(wtmp, trans(H)); viennacl::hsa::kernel & mul_div_kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::nmf<NumericT>::program_name(), NMF_MUL_DIV_KERNEL); viennacl::hsa::enqueue(mul_div_kernel(W, wn, wd, cl_uint(W.internal_size1() * W.internal_size2()))); } if (i % conf.check_after_steps() == 0) //check for convergence { appr = viennacl::linalg::prod(W, H); appr -= V; NumericT diff_val = viennacl::linalg::norm_frobenius(appr); if (i == 0) diff_init = diff_val; if (conf.print_relative_error()) std::cout << diff_val / diff_init << std::endl; // Approximation check if (diff_val / diff_init < conf.tolerance()) break; // Stagnation check if (std::fabs(diff_val - last_diff) / (diff_val * NumericT(conf.check_after_steps())) < conf.stagnation_tolerance()) //avoid situations where convergence stagnates { if (stagnation_flag) // iteration stagnates (two iterates with no notable progress) break; else // record stagnation in this iteration stagnation_flag = true; } else // good progress in this iteration, so unset stagnation flag stagnation_flag = false; // prepare for next iterate: last_diff = diff_val; } } }
inline cl_uint make_options(vcl_size_t length, bool reciprocal, bool flip_sign) { return static_cast<cl_uint>( ((length > 1) ? (cl_uint(length) << 2) : 0) + (reciprocal ? 2 : 0) + (flip_sign ? 1 : 0) ); }
void nmf(viennacl::matrix<ScalarType> const & v, viennacl::matrix<ScalarType> & w, viennacl::matrix<ScalarType> & h, std::size_t k, ScalarType eps = 0.000001, std::size_t max_iter = 10000, std::size_t check_diff_every_step = 100) { viennacl::linalg::kernels::nmf<ScalarType, 1>::init(); w.resize(v.size1(), k); h.resize(k, v.size2()); std::vector<ScalarType> stl_w(w.internal_size1() * w.internal_size2()); std::vector<ScalarType> stl_h(h.internal_size1() * h.internal_size2()); for (std::size_t j = 0; j < stl_w.size(); j++) stl_w[j] = static_cast<ScalarType>(rand()) / RAND_MAX; for (std::size_t j = 0; j < stl_h.size(); j++) stl_h[j] = static_cast<ScalarType>(rand()) / RAND_MAX; viennacl::matrix<ScalarType> wn(v.size1(), k); viennacl::matrix<ScalarType> wd(v.size1(), k); viennacl::matrix<ScalarType> wtmp(v.size1(), v.size2()); viennacl::matrix<ScalarType> hn(k, v.size2()); viennacl::matrix<ScalarType> hd(k, v.size2()); viennacl::matrix<ScalarType> htmp(k, k); viennacl::matrix<ScalarType> appr(v.size1(), v.size2()); viennacl::vector<ScalarType> diff(v.size1() * v.size2()); viennacl::fast_copy(&stl_w[0], &stl_w[0] + stl_w.size(), w); viennacl::fast_copy(&stl_h[0], &stl_h[0] + stl_h.size(), h); ScalarType last_diff = 0.0f; for (std::size_t i = 0; i < max_iter; i++) { { hn = viennacl::linalg::prod(trans(w), v); htmp = viennacl::linalg::prod(trans(w), w); hd = viennacl::linalg::prod(htmp, h); viennacl::ocl::kernel & mul_div_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::nmf<ScalarType, 1>::program_name(), NMF_MUL_DIV_KERNEL); viennacl::ocl::enqueue(mul_div_kernel(h, hn, hd, cl_uint(stl_h.size()))); } { wn = viennacl::linalg::prod(v, trans(h)); wtmp = viennacl::linalg::prod(w, h); wd = viennacl::linalg::prod(wtmp, trans(h)); viennacl::ocl::kernel & mul_div_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::nmf<ScalarType, 1>::program_name(), NMF_MUL_DIV_KERNEL); viennacl::ocl::enqueue(mul_div_kernel(w, wn, wd, cl_uint(stl_w.size()))); } if (i % check_diff_every_step == 0) { appr = viennacl::linalg::prod(w, h); viennacl::ocl::kernel & sub_kernel = viennacl::ocl::get_kernel(viennacl::linalg::kernels::nmf<ScalarType, 1>::program_name(), NMF_SUB_KERNEL); //this is a cheat. i.e save difference of two matrix into vector to get norm_2 viennacl::ocl::enqueue(sub_kernel(appr, v, diff, cl_uint(v.size1() * v.size2()))); ScalarType diff_val = viennacl::linalg::norm_2(diff); if((diff_val < eps) || (fabs(diff_val - last_diff) < eps)) { //std::cout << "Breaked at diff - " << diff_val << "\n"; break; } last_diff = diff_val; //printf("Iteration #%lu - %.5f \n", i, diff_val); } } }
VectorType solve(const MatrixType & matrix, VectorType const & rhs, mixed_precision_cg_tag const & tag) { //typedef typename VectorType::value_type ScalarType; typedef typename viennacl::result_of::value_type<VectorType>::type ScalarType; typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type CPU_ScalarType; //TODO: Assert CPU_ScalarType == double //std::cout << "Starting CG" << std::endl; vcl_size_t problem_size = viennacl::traits::size(rhs); VectorType result(rhs); viennacl::traits::clear(result); VectorType residual = rhs; CPU_ScalarType ip_rr = viennacl::linalg::inner_prod(rhs, rhs); CPU_ScalarType new_ip_rr = 0; CPU_ScalarType norm_rhs_squared = ip_rr; if (norm_rhs_squared == 0) //solution is zero if RHS norm is zero return result; static bool first = true; viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(matrix).context()); if (first) { ctx.add_program(double_float_conversion_program, "double_float_conversion_program"); } viennacl::vector<float> residual_low_precision(problem_size, viennacl::traits::context(rhs)); viennacl::vector<float> result_low_precision(problem_size, viennacl::traits::context(rhs)); viennacl::vector<float> p_low_precision(problem_size, viennacl::traits::context(rhs)); viennacl::vector<float> tmp_low_precision(problem_size, viennacl::traits::context(rhs)); float inner_ip_rr = static_cast<float>(ip_rr); float new_inner_ip_rr = 0; float initial_inner_rhs_norm_squared = static_cast<float>(ip_rr); float alpha; float beta; viennacl::ocl::kernel & assign_double_to_float = ctx.get_kernel("double_float_conversion_program", "assign_double_to_float"); viennacl::ocl::kernel & inplace_add_float_to_double = ctx.get_kernel("double_float_conversion_program", "inplace_add_float_to_double"); // transfer rhs to single precision: viennacl::ocl::enqueue( assign_double_to_float(p_low_precision.handle().opencl_handle(), rhs.handle().opencl_handle(), cl_uint(rhs.size()) ) ); //std::cout << "copying p_low_precision..." << std::endl; //assign_double_to_float(p_low_precision.handle(), residual.handle(), residual.size()); residual_low_precision = p_low_precision; // transfer matrix to single precision: viennacl::compressed_matrix<float> matrix_low_precision(matrix.size1(), matrix.size2(), matrix.nnz(), viennacl::traits::context(rhs)); viennacl::backend::memory_copy(matrix.handle1(), const_cast<viennacl::backend::mem_handle &>(matrix_low_precision.handle1()), 0, 0, sizeof(cl_uint) * (matrix.size1() + 1) ); viennacl::backend::memory_copy(matrix.handle2(), const_cast<viennacl::backend::mem_handle &>(matrix_low_precision.handle2()), 0, 0, sizeof(cl_uint) * (matrix.nnz()) ); viennacl::ocl::enqueue( assign_double_to_float(matrix_low_precision.handle().opencl_handle(), matrix.handle().opencl_handle(), cl_uint(matrix.nnz()) ) ); //std::cout << "copying matrix_low_precision..." << std::endl; //assign_double_to_float(const_cast<viennacl::backend::mem_handle &>(matrix_low_precision.handle()), matrix.handle(), matrix.nnz()); //std::cout << "Starting CG solver iterations... " << std::endl; for (unsigned int i = 0; i < tag.max_iterations(); ++i) { tag.iters(i+1); // lower precision 'inner iteration' tmp_low_precision = viennacl::linalg::prod(matrix_low_precision, p_low_precision); alpha = inner_ip_rr / viennacl::linalg::inner_prod(tmp_low_precision, p_low_precision); result_low_precision += alpha * p_low_precision; residual_low_precision -= alpha * tmp_low_precision; new_inner_ip_rr = viennacl::linalg::inner_prod(residual_low_precision, residual_low_precision); beta = new_inner_ip_rr / inner_ip_rr; inner_ip_rr = new_inner_ip_rr; p_low_precision = residual_low_precision + beta * p_low_precision; if (new_inner_ip_rr < tag.inner_tolerance() * initial_inner_rhs_norm_squared || i == tag.max_iterations()-1) { //std::cout << "outer correction at i=" << i << std::endl; //result += result_low_precision; viennacl::ocl::enqueue( inplace_add_float_to_double(result.handle().opencl_handle(), result_low_precision.handle().opencl_handle(), cl_uint(result.size()) ) ); // residual = b - Ax (without introducing a temporary) residual = viennacl::linalg::prod(matrix, result); residual = rhs - residual; new_ip_rr = viennacl::linalg::inner_prod(residual, residual); if (new_ip_rr / norm_rhs_squared < tag.tolerance() * tag.tolerance())//squared norms involved here break; // p_low_precision = residual; viennacl::ocl::enqueue( assign_double_to_float(p_low_precision.handle().opencl_handle(), residual.handle().opencl_handle(), cl_uint(residual.size()) ) ); result_low_precision.clear(); residual_low_precision = p_low_precision; initial_inner_rhs_norm_squared = static_cast<float>(new_ip_rr); inner_ip_rr = static_cast<float>(new_ip_rr); } } //store last error estimate: tag.error(std::sqrt(new_ip_rr / norm_rhs_squared)); return result; }
static void dump(viennacl::backend::mem_handle const & buff, uniform_tag tag, cl_uint start, cl_uint size){ viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::rand<ScalarType,1>::program_name(),"dump_uniform"); k.global_work_size(0, viennacl::tools::roundUpToNextMultiple<unsigned int>(size,k.local_work_size(0))); viennacl::ocl::enqueue(k(buff.opencl_handle(), start, size, cl_float(tag.a), cl_float(tag.b) , cl_uint(time(0)))); }
/** * With this let us go right to main(): **/ int main() { typedef float ScalarType; /** * <h2>Part 1: Set up a custom context</h2> * * The following is rather lengthy because OpenCL is a fairly low-level framework. * For comparison, the subsequent code explicitly performs the OpenCL setup that is done * in the background within the 'custom_kernels'-tutorial **/ //manually set up a custom OpenCL context: std::vector<cl_device_id> device_id_array; //get all available devices viennacl::ocl::platform pf; std::cout << "Platform info: " << pf.info() << std::endl; std::vector<viennacl::ocl::device> devices = pf.devices(CL_DEVICE_TYPE_DEFAULT); std::cout << devices[0].name() << std::endl; std::cout << "Number of devices for custom context: " << devices.size() << std::endl; //set up context using all found devices: for (std::size_t i=0; i<devices.size(); ++i) { device_id_array.push_back(devices[i].id()); } std::cout << "Creating context..." << std::endl; cl_int err; cl_context my_context = clCreateContext(0, cl_uint(device_id_array.size()), &(device_id_array[0]), NULL, NULL, &err); VIENNACL_ERR_CHECK(err); //create two Vectors: unsigned int vector_size = 10; std::vector<ScalarType> vec1(vector_size); std::vector<ScalarType> vec2(vector_size); std::vector<ScalarType> result(vector_size); // // fill the operands vec1 and vec2: // for (unsigned int i=0; i<vector_size; ++i) { vec1[i] = static_cast<ScalarType>(i); vec2[i] = static_cast<ScalarType>(vector_size-i); } // // create memory in OpenCL context: // cl_mem mem_vec1 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec1[0]), &err); VIENNACL_ERR_CHECK(err); cl_mem mem_vec2 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec2[0]), &err); VIENNACL_ERR_CHECK(err); cl_mem mem_result = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(result[0]), &err); VIENNACL_ERR_CHECK(err); // // create a command queue for each device: // std::vector<cl_command_queue> queues(devices.size()); for (std::size_t i=0; i<devices.size(); ++i) { queues[i] = clCreateCommandQueue(my_context, devices[i].id(), 0, &err); VIENNACL_ERR_CHECK(err); } // // create and build a program in the context: // std::size_t source_len = std::string(my_compute_program).length(); cl_program my_prog = clCreateProgramWithSource(my_context, 1, &my_compute_program, &source_len, &err); err = clBuildProgram(my_prog, 0, NULL, NULL, NULL, NULL); /* char buffer[1024]; cl_build_status status; clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL); clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_LOG, sizeof(char)*1024, &buffer, NULL); std::cout << "Build Scalar: Err = " << err << " Status = " << status << std::endl; std::cout << "Log: " << buffer << std::endl;*/ VIENNACL_ERR_CHECK(err); // // create a kernel from the program: // const char * kernel_name = "elementwise_prod"; cl_kernel my_kernel = clCreateKernel(my_prog, kernel_name, &err); VIENNACL_ERR_CHECK(err); // // Execute elementwise_prod kernel on first queue: result = vec1 .* vec2; // err = clSetKernelArg(my_kernel, 0, sizeof(cl_mem), (void*)&mem_vec1); VIENNACL_ERR_CHECK(err); err = clSetKernelArg(my_kernel, 1, sizeof(cl_mem), (void*)&mem_vec2); VIENNACL_ERR_CHECK(err); err = clSetKernelArg(my_kernel, 2, sizeof(cl_mem), (void*)&mem_result); VIENNACL_ERR_CHECK(err); err = clSetKernelArg(my_kernel, 3, sizeof(unsigned int), (void*)&vector_size); VIENNACL_ERR_CHECK(err); std::size_t global_size = vector_size; std::size_t local_size = vector_size; err = clEnqueueNDRangeKernel(queues[0], my_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL); VIENNACL_ERR_CHECK(err); // // Read and output result: // err = clEnqueueReadBuffer(queues[0], mem_vec1, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(vec1[0]), 0, NULL, NULL); VIENNACL_ERR_CHECK(err); err = clEnqueueReadBuffer(queues[0], mem_result, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(result[0]), 0, NULL, NULL); VIENNACL_ERR_CHECK(err); std::cout << "vec1 : "; for (std::size_t i=0; i<vec1.size(); ++i) std::cout << vec1[i] << " "; std::cout << std::endl; std::cout << "vec2 : "; for (std::size_t i=0; i<vec2.size(); ++i) std::cout << vec2[i] << " "; std::cout << std::endl; std::cout << "result: "; for (std::size_t i=0; i<result.size(); ++i) std::cout << result[i] << " "; std::cout << std::endl; /** * <h2>Part 2: Reuse Custom OpenCL Context with ViennaCL</h2> * * To let ViennaCL reuse the previously created context, we need to make it known to ViennaCL \em before any ViennaCL objects are created. * We inject the custom context as the context with default id '0' when using viennacl::ocl::switch_context(). **/ viennacl::ocl::setup_context(0, my_context, device_id_array, queues); viennacl::ocl::switch_context(0); //activate the new context (only mandatory with context-id not equal to zero) /** * Check that ViennaCL really uses the new context: **/ std::cout << "Existing context: " << my_context << std::endl; std::cout << "ViennaCL uses context: " << viennacl::ocl::current_context().handle().get() << std::endl; /** * Wrap existing OpenCL objects into ViennaCL: **/ viennacl::vector<ScalarType> vcl_vec1(mem_vec1, vector_size); viennacl::vector<ScalarType> vcl_vec2(mem_vec2, vector_size); viennacl::vector<ScalarType> vcl_result(mem_result, vector_size); viennacl::scalar<ScalarType> vcl_s = 2.0; std::cout << "Standard vector operations within ViennaCL:" << std::endl; vcl_result = vcl_s * vcl_vec1 + vcl_vec2; std::cout << "vec1 : "; std::cout << vcl_vec1 << std::endl; std::cout << "vec2 : "; std::cout << vcl_vec2 << std::endl; std::cout << "result: "; std::cout << vcl_result << std::endl; /** * We can also reuse the existing elementwise_prod kernel. * Therefore, we first have to make the existing program known to ViennaCL * For more details on the three lines, see tutorial 'custom-kernels' **/ std::cout << "Using existing kernel within the OpenCL backend of ViennaCL:" << std::endl; viennacl::ocl::program & my_vcl_prog = viennacl::ocl::current_context().add_program(my_prog, "my_compute_program"); viennacl::ocl::kernel & my_vcl_kernel = my_vcl_prog.add_kernel(my_kernel, "elementwise_prod"); viennacl::ocl::enqueue(my_vcl_kernel(vcl_vec1, vcl_vec2, vcl_result, static_cast<cl_uint>(vcl_vec1.size()))); //Note that std::size_t might differ between host and device. Thus, a cast to cl_uint is necessary here. std::cout << "vec1 : "; std::cout << vcl_vec1 << std::endl; std::cout << "vec2 : "; std::cout << vcl_vec2 << std::endl; std::cout << "result: "; std::cout << vcl_result << std::endl; /** * Since a linear piece of memory can be interpreted in several ways, * we will now create a 3x3 row-major matrix out of the linear memory in mem_vec1/ * The first three entries in vcl_vec2 and vcl_result are used to carry out matrix-vector products: **/ viennacl::matrix<ScalarType> vcl_matrix(mem_vec1, 3, 3); vcl_vec2.resize(3); //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied) vcl_result.resize(3); //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied) vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_vec2); std::cout << "result of matrix-vector product: "; std::cout << vcl_result << std::endl; /** * Any further operations can be carried out in the same way. * Just keep in mind that any resizing of vectors or matrices leads to a reallocation of the underlying memory buffer, through which the 'wrapper' is lost. **/ std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl; return EXIT_SUCCESS; }