/** @brief Inplace solution of a sparse upper triangular system U * vec = vec (backward substitution), OpenCL backend. */
template<typename SCALARTYPE, unsigned int MAT_ALIGNMENT, unsigned int VEC_ALIGNMENT>
void inplace_solve(compressed_matrix<SCALARTYPE, MAT_ALIGNMENT> const & U,
                   vector<SCALARTYPE, VEC_ALIGNMENT> & vec,
                   viennacl::linalg::upper_tag)
{
  viennacl::ocl::kernel & k = viennacl::ocl::get_kernel(viennacl::linalg::kernels::compressed_matrix<SCALARTYPE, MAT_ALIGNMENT>::program_name(), "lu_backward");
  unsigned int threads = k.local_work_size();

  // launch a single work-group; the kernel uses two shared-memory buffers of (threads+2) entries each
  k.global_work_size(k.local_work_size());
  viennacl::ocl::enqueue(k(U.handle1().get(), U.handle2().get(), U.handle().get(),
                           viennacl::ocl::local_mem(sizeof(int) * (threads + 2)),
                           viennacl::ocl::local_mem(sizeof(SCALARTYPE) * (threads + 2)),
                           vec,
                           U.size1()));
}
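// Usage sketch (illustrative only, not part of the library source; the matrix/vector names and the
// dimension 1000 are assumptions): the overload above is normally reached through the public
// viennacl::linalg::inplace_solve() interface after copying the upper triangular system to the device.
//
//   boost::numeric::ublas::compressed_matrix<double> cpu_U(1000, 1000);   // upper triangular factor, filled by the user
//   viennacl::compressed_matrix<double>              gpu_U(1000, 1000);
//   viennacl::vector<double>                         rhs(1000);           // right-hand side, already on the device
//
//   viennacl::copy(cpu_U, gpu_U);
//   viennacl::linalg::inplace_solve(gpu_U, rhs, viennacl::linalg::upper_tag());   // rhs is overwritten with the solution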
/** @brief Extracts per-row information (e.g., diagonal entries or row norms, selected via info_selector) from a CSR matrix, CUDA backend. */
template<typename ScalarType, unsigned int MAT_ALIGNMENT>
void row_info(compressed_matrix<ScalarType, MAT_ALIGNMENT> const & mat,
              vector_base<ScalarType> & vec,
              viennacl::linalg::detail::row_info_types info_selector)
{
  csr_row_info_extractor_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                              detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                              detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                              detail::cuda_arg<ScalarType>(vec),
                                              static_cast<unsigned int>(mat.size1()),
                                              static_cast<unsigned int>(info_selector));
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
}
/** @brief Copies a sparse matrix from the OpenCL device to a CPU matrix type providing resize(rows, cols) and operator()(row, col). */
template<typename CPU_MATRIX, typename SCALARTYPE, unsigned int ALIGNMENT>
void copy(const compressed_matrix<SCALARTYPE, ALIGNMENT> & gpu_matrix,
          CPU_MATRIX & cpu_matrix)
{
  if (gpu_matrix.size1() > 0 && gpu_matrix.size2() > 0)
  {
    cpu_matrix.resize(gpu_matrix.size1(), gpu_matrix.size2());

    //get raw data from memory:
    std::vector<unsigned int> row_buffer(gpu_matrix.size1() + 1);
    std::vector<unsigned int> col_buffer(gpu_matrix.nnz());
    std::vector<SCALARTYPE>   elements(gpu_matrix.nnz());

    cl_int err;
    err = clEnqueueReadBuffer(viennacl::ocl::device().queue().get(), gpu_matrix.handle1().get(), CL_TRUE, 0, sizeof(unsigned int) * (gpu_matrix.size1() + 1), &(row_buffer[0]), 0, NULL, NULL);
    CL_ERR_CHECK(err);
    err = clEnqueueReadBuffer(viennacl::ocl::device().queue().get(), gpu_matrix.handle2().get(), CL_TRUE, 0, sizeof(unsigned int) * gpu_matrix.nnz(), &(col_buffer[0]), 0, NULL, NULL);
    CL_ERR_CHECK(err);
    err = clEnqueueReadBuffer(viennacl::ocl::device().queue().get(), gpu_matrix.handle().get(), CL_TRUE, 0, sizeof(SCALARTYPE) * gpu_matrix.nnz(), &(elements[0]), 0, NULL, NULL);
    CL_ERR_CHECK(err);
    viennacl::ocl::finish();

    //fill the cpu_matrix:
    unsigned int data_index = 0;
    for (unsigned int row = 1; row <= gpu_matrix.size1(); ++row)
    {
      while (data_index < row_buffer[row])
      {
        if (col_buffer[data_index] >= gpu_matrix.size2())   // column indices must be smaller than the number of columns
        {
          std::cerr << "ViennaCL encountered invalid data at col_buffer[" << data_index << "]: " << col_buffer[data_index] << std::endl;
          return;
        }
        if (elements[data_index] != static_cast<SCALARTYPE>(0.0))
          cpu_matrix(row - 1, col_buffer[data_index]) = elements[data_index];
        ++data_index;
      }
    }
  }
}
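// Usage sketch (illustrative only; CPU_MATRIX can be any type offering resize(rows, cols) and
// operator()(row, col) -- a Boost.uBLAS sparse matrix is just one assumed choice):
//
//   viennacl::compressed_matrix<double>              gpu_A(1000, 1000);   // assembled or computed on the device
//   boost::numeric::ublas::compressed_matrix<double> cpu_A;
//
//   viennacl::copy(gpu_A, cpu_A);   // transfers row pointers, column indices and values, then fills cpu_A entry by entry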