VectorType solve(const MatrixType & matrix, VectorType const & rhs, cg_tag const & tag, PreconditionerType const & precond)
{
  // Preconditioned conjugate gradient: minimizes the A-norm of the error over
  // the Krylov space built from the preconditioned residual.
  typedef typename viennacl::result_of::value_type<VectorType>::type      ScalarType;
  typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type  CPU_ScalarType;

  unsigned int problem_size = viennacl::traits::size(rhs);

  VectorType result(problem_size);
  viennacl::traits::clear(result);            // start from the zero initial guess

  VectorType residual = rhs;                  // r_0 = b - A*0 = b
  VectorType tmp(problem_size);               // holds A*p each iteration
  VectorType z = rhs;
  precond.apply(z);                           // z_0 = M^{-1} r_0
  VectorType p = z;                           // first search direction

  CPU_ScalarType ip_rr = viennacl::linalg::inner_prod(residual, z);  // <r, z>
  CPU_ScalarType alpha;
  CPU_ScalarType new_ip_rr = 0;
  CPU_ScalarType beta;
  CPU_ScalarType norm_rhs_squared = ip_rr;    // reference value for the relative stopping test
  CPU_ScalarType new_ipp_rr_over_norm_rhs;

  if (norm_rhs_squared == 0)                  // solution is zero if RHS norm is zero
    return result;

  for (unsigned int iter = 0; iter < tag.max_iterations(); ++iter)
  {
    tag.iters(iter + 1);

    tmp = viennacl::linalg::prod(matrix, p);  // tmp = A*p

    alpha = ip_rr / viennacl::linalg::inner_prod(tmp, p);   // step length

    result   += alpha * p;                    // x_{k+1} = x_k + alpha*p
    residual -= alpha * tmp;                  // r_{k+1} = r_k - alpha*A*p

    z = residual;
    precond.apply(z);                         // z_{k+1} = M^{-1} r_{k+1}

    new_ip_rr = viennacl::linalg::inner_prod(residual, z);
    new_ipp_rr_over_norm_rhs = new_ip_rr / norm_rhs_squared;
    if (std::fabs(new_ipp_rr_over_norm_rhs) < tag.tolerance() * tag.tolerance())  // squared norms involved here
      break;

    beta  = new_ip_rr / ip_rr;                // Fletcher-Reeves update
    ip_rr = new_ip_rr;

    p = z + beta * p;                         // next search direction
  }

  // store last error estimate (relative residual in the M^{-1}-inner product):
  tag.error(std::sqrt(std::fabs(new_ip_rr / norm_rhs_squared)));

  return result;
}
VectorType solve(const MatrixType & matrix, VectorType const & rhs, cg_tag const & tag, PreconditionerType const & precond)
{
  // Preconditioned conjugate gradient solver (legacy CPU_SCALAR_TYPE_DEDUCER API).
  //
  // Fixes applied:
  //  - guard against a zero right-hand side: previously the convergence test
  //    divided by norm_rhs == 0, producing NaN and running all iterations;
  //  - host-side scalars now use CPU_ScalarType (the typedef was declared but
  //    unused), matching the convention of the other CG overload and avoiding
  //    per-iteration device scalar round-trips;
  //  - std::sqrt instead of unqualified sqrt.
  typedef typename VectorType::value_type ScalarType;
  typedef typename viennacl::tools::CPU_SCALAR_TYPE_DEDUCER<ScalarType>::ResultType CPU_ScalarType;

  VectorType result(rhs.size());
  result.clear();                             // zero initial guess

  VectorType residual = rhs;                  // r_0 = b
  VectorType tmp(rhs.size());                 // holds A*p each iteration
  VectorType z = rhs;
  precond.apply(z);                           // z_0 = M^{-1} r_0
  VectorType p = z;                           // first search direction

  CPU_ScalarType ip_rr = viennacl::linalg::inner_prod(residual, z);
  CPU_ScalarType alpha;
  CPU_ScalarType new_ip_rr = 0;
  CPU_ScalarType beta;
  CPU_ScalarType norm_rhs = ip_rr;            // reference value for the relative stopping test

  if (norm_rhs == 0)                          // solution is zero if RHS norm is zero
    return result;

  for (unsigned int i = 0; i < tag.iterations(); ++i)
  {
    tag.iters(i + 1);

    tmp = viennacl::linalg::prod(matrix, p);  // tmp = A*p

    alpha = ip_rr / viennacl::linalg::inner_prod(tmp, p);   // step length

    result   += alpha * p;
    residual -= alpha * tmp;

    z = residual;
    precond.apply(z);                         // z = M^{-1} r

    new_ip_rr = viennacl::linalg::inner_prod(residual, z);
    if (std::fabs(new_ip_rr / norm_rhs) < tag.tolerance() * tag.tolerance())  // squared norms involved here
      break;

    beta  = new_ip_rr / ip_rr;
    ip_rr = new_ip_rr;

    p = z + beta * p;                         // next search direction
  }

  // store last error estimate:
  tag.error(std::sqrt(std::fabs(new_ip_rr / norm_rhs)));

  return result;
}
VectorType solve(const MatrixType & matrix, VectorType const & rhs, gmres_tag const & tag, PreconditionerType const & precond)
{
  // Restarted GMRES with left preconditioning. The Krylov basis is built
  // implicitly via Householder reflectors (the vectors U[k]); R holds the
  // (transposed) upper-triangular projection matrix and projection_rhs the
  // projected right-hand side, so the small least-squares problem is solved
  // by back-substitution at the end of each restart cycle.
  typedef typename viennacl::result_of::value_type<VectorType>::type     ScalarType;
  typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type CPU_ScalarType;

  unsigned int problem_size = viennacl::traits::size(rhs);
  VectorType result(problem_size);
  viennacl::traits::clear(result);                 // zero initial guess

  unsigned int krylov_dim = tag.krylov_dim();
  if (problem_size < tag.krylov_dim())
    krylov_dim = problem_size; //A Krylov space larger than the matrix would lead to seg-faults (mathematically, error is certain to be zero already)

  VectorType res(problem_size);                    // (preconditioned) residual, progressively reflected
  VectorType v_k_tilde(problem_size);              // candidate Krylov vector before orthogonalization
  VectorType v_k_tilde_temp(problem_size);

  std::vector< std::vector<CPU_ScalarType> > R(krylov_dim);   // columns of the projected system (stored transposed)
  std::vector<CPU_ScalarType> projection_rhs(krylov_dim);     // projected RHS of the small system
  std::vector<VectorType> U(krylov_dim);                      // Householder vectors

  const CPU_ScalarType gpu_scalar_minus_1 = static_cast<CPU_ScalarType>(-1); //representing the scalar '-1' on the GPU. Prevents blocking write operations
  const CPU_ScalarType gpu_scalar_1 = static_cast<CPU_ScalarType>(1); //representing the scalar '1' on the GPU. Prevents blocking write operations
  const CPU_ScalarType gpu_scalar_2 = static_cast<CPU_ScalarType>(2); //representing the scalar '2' on the GPU. Prevents blocking write operations

  CPU_ScalarType norm_rhs = viennacl::linalg::norm_2(rhs);

  if (norm_rhs == 0) //solution is zero if RHS norm is zero
    return result;

  unsigned int k;
  for (k = 0; k < krylov_dim; ++k)
  {
    R[k].resize(tag.krylov_dim());
    viennacl::traits::resize(U[k], problem_size);
  }

  //std::cout << "Starting GMRES..." << std::endl;
  tag.iters(0);

  for (unsigned int it = 0; it <= tag.max_restarts(); ++it)
  {
    //std::cout << "-- GMRES Start " << it << " -- " << std::endl;

    // (preconditioned) residual of the current iterate:
    res = rhs;
    res -= viennacl::linalg::prod(matrix, result);  //initial guess zero
    precond.apply(res);
    //std::cout << "Residual: " << res << std::endl;

    CPU_ScalarType rho_0 = viennacl::linalg::norm_2(res);   // residual norm at the start of this cycle
    CPU_ScalarType rho = static_cast<CPU_ScalarType>(1.0);  // running relative residual within the cycle
    //std::cout << "rho_0: " << rho_0 << std::endl;

    // already converged (or degenerate RHS)?
    if (rho_0 / norm_rhs < tag.tolerance() || (norm_rhs == CPU_ScalarType(0.0)) )
    {
      //std::cout << "Allowed Error reached at begin of loop" << std::endl;
      tag.error(rho_0 / norm_rhs);
      return result;
    }

    res /= rho_0;                                    // normalize residual for the Krylov process
    //std::cout << "Normalized Residual: " << res << std::endl;

    // reset the per-cycle workspace:
    for (k=0; k<krylov_dim; ++k)
    {
      viennacl::traits::clear(R[k]);
      viennacl::traits::clear(U[k]);
      R[k].resize(krylov_dim);
      viennacl::traits::resize(U[k], problem_size);
    }

    for (k = 0; k < krylov_dim; ++k)
    {
      tag.iters( tag.iters() + 1 ); //increase iteration counter

      //compute v_k = A * v_{k-1} via Householder matrices
      if (k == 0)
      {
        v_k_tilde = viennacl::linalg::prod(matrix, res);
        precond.apply(v_k_tilde);
      }
      else
      {
        // v_k_tilde = M^{-1} A P_1 ... P_{k-1} e_{k-1}, then reflected back:
        viennacl::traits::clear(v_k_tilde);
        v_k_tilde[k-1] = gpu_scalar_1;
        //Householder rotations part 1
        for (int i = k-1; i > -1; --i)
          v_k_tilde -= U[i] * (viennacl::linalg::inner_prod(U[i], v_k_tilde) * gpu_scalar_2);

        v_k_tilde_temp = viennacl::linalg::prod(matrix, v_k_tilde);
        precond.apply(v_k_tilde_temp);
        v_k_tilde = v_k_tilde_temp;

        //Householder rotations part 2
        for (unsigned int i = 0; i < k; ++i)
          v_k_tilde -= U[i] * (viennacl::linalg::inner_prod(U[i], v_k_tilde) * gpu_scalar_2);
      }

      //std::cout << "v_k_tilde: " << v_k_tilde << std::endl;

      // build the k-th Householder vector U[k] annihilating entries k+1..n of v_k_tilde:
      viennacl::traits::clear(U[k]);
      viennacl::traits::resize(U[k], problem_size);
      //copy first k entries from v_k_tilde to U[k]:
      gmres_copy_helper(v_k_tilde, U[k], k);

      // norm of the tail of v_k_tilde (entries k..n-1):
      U[k][k] = std::sqrt( viennacl::linalg::inner_prod(v_k_tilde, v_k_tilde) - viennacl::linalg::inner_prod(U[k], U[k]) );

      if (fabs(U[k][k]) < CPU_ScalarType(10 * std::numeric_limits<CPU_ScalarType>::epsilon()))
        break; //Note: Solution is essentially (up to round-off error) already in Krylov space. No need to proceed.

      //copy first k+1 entries from U[k] to R[k]
      gmres_copy_helper(U[k], R[k], k+1);

      U[k] -= v_k_tilde;
      //std::cout << "U[k] before normalization: " << U[k] << std::endl;
      U[k] *= gpu_scalar_minus_1 / viennacl::linalg::norm_2( U[k] );
      //std::cout << "Householder vector U[k]: " << U[k] << std::endl;

      //DEBUG: Make sure that P_k v_k_tilde equals (rho_{1,k}, ... , rho_{k,k}, 0, 0 )
#ifdef VIENNACL_GMRES_DEBUG
      std::cout << "P_k v_k_tilde: " << (v_k_tilde - 2.0 * U[k] * inner_prod(U[k], v_k_tilde)) << std::endl;
      std::cout << "R[k]: [" << R[k].size() << "](";
      for (size_t i=0; i<R[k].size(); ++i)
        std::cout << R[k][i] << ",";
      std::cout << ")" << std::endl;
#endif

      // apply the new reflector to the residual:
      //std::cout << "P_k res: " << (res - 2.0 * U[k] * inner_prod(U[k], res)) << std::endl;
      res -= U[k] * (viennacl::linalg::inner_prod( U[k], res ) * gpu_scalar_2);
      //std::cout << "zeta_k: " << viennacl::linalg::inner_prod( U[k], res ) * gpu_scalar_2 << std::endl;
      //std::cout << "Updated res: " << res << std::endl;

#ifdef VIENNACL_GMRES_DEBUG
      VectorType v1(U[k].size()); v1.clear(); v1.resize(U[k].size());
      v1(0) = 1.0;
      v1 -= U[k] * (viennacl::linalg::inner_prod( U[k], v1 ) * gpu_scalar_2);
      std::cout << "v1: " << v1 << std::endl;
      boost::numeric::ublas::matrix<ScalarType> P = -2.0 * outer_prod(U[k], U[k]);
      P(0,0) += 1.0; P(1,1) += 1.0; P(2,2) += 1.0;
      std::cout << "P: " << P << std::endl;
#endif

      // clamp res[k] to [-rho, rho] so acos below stays in its domain:
      if (res[k] > rho) //machine precision reached
        res[k] = rho;
      if (res[k] < -1.0 * rho) //machine precision reached
        res[k] = -1.0 * rho;

      projection_rhs[k] = res[k];

      // shrink the running residual estimate by the Givens-rotation sine:
      rho *= std::sin( std::acos(projection_rhs[k] / rho) );

#ifdef VIENNACL_GMRES_DEBUG
      std::cout << "k-th component of r: " << res[k] << std::endl;
      std::cout << "New rho (norm of res): " << rho << std::endl;
#endif

      if (std::fabs(rho * rho_0 / norm_rhs) < tag.tolerance())
      {
        //std::cout << "Krylov space big enough" << endl;
        tag.error( std::fabs(rho*rho_0 / norm_rhs) );
        ++k;   // k now counts the number of Krylov vectors actually built
        break;
      }

      //std::cout << "Current residual: " << rho * rho_0 << std::endl;
      //std::cout << " - End of Krylov space setup - " << std::endl;
    } // for k

#ifdef VIENNACL_GMRES_DEBUG
    //inplace solution of the upper triangular matrix:
    std::cout << "Upper triangular system:" << std::endl;
    std::cout << "Size of Krylov space: " << k << std::endl;
    for (size_t i=0; i<k; ++i)
    {
      for (size_t j=0; j<k; ++j)
      {
        std::cout << R[j][i] << ", ";
      }
      std::cout << " | " << projection_rhs[i] << std::endl;
    }
#endif

    // back-substitution on the small projected system (R is stored transposed):
    for (int i=k-1; i>-1; --i)
    {
      for (unsigned int j=i+1; j<k; ++j)
        //temp_rhs[i] -= R[i][j] * temp_rhs[j];   //if R is not transposed
        projection_rhs[i] -= R[j][i] * projection_rhs[j];   //R is transposed

      projection_rhs[i] /= R[i][i];
    }

#ifdef VIENNACL_GMRES_DEBUG
    std::cout << "Result of triangular solver: ";
    for (size_t i=0; i<k; ++i)
      std::cout << projection_rhs[i] << ", ";
    std::cout << std::endl;
#endif

    // assemble the update in the full space: start from the projected coefficients ...
    res *= projection_rhs[0];

    if (k > 0)
    {
      for (unsigned int i = 0; i < k-1; ++i)
      {
        res[i] += projection_rhs[i+1];
      }
    }

    // ... and undo the Householder reflections (in reverse order):
    for (int i = k-1; i > -1; --i)
      res -= U[i] * (viennacl::linalg::inner_prod(U[i], res) * gpu_scalar_2);

    res *= rho_0;        // undo the normalization of the residual
    result += res;       // update the iterate

    if ( std::fabs(rho*rho_0 / norm_rhs) < tag.tolerance() )
    {
      //std::cout << "Allowed Error reached at end of loop" << std::endl;
      tag.error(std::fabs(rho*rho_0 / norm_rhs));
      return result;
    }

    //res = rhs;
    //res -= viennacl::linalg::prod(matrix, result);
    //std::cout << "norm_2(r)=" << norm_2(r) << std::endl;
    //std::cout << "std::abs(rho*rho_0)=" << std::abs(rho*rho_0) << std::endl;
    //std::cout << r << std::endl;

    tag.error(std::fabs(rho*rho_0));
  }

  return result;
}
VectorType solve(const MatrixType & matrix, VectorType const & rhs, bicgstab_tag const & tag, PreconditionerType const & precond)
{
  // Preconditioned BiCGStab with automatic restarts when the search
  // direction degenerates (rho == 0 or omega == 0) or after a configurable
  // number of iterations without restart.
  typedef typename viennacl::result_of::value_type<VectorType>::type      ScalarType;
  typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type  CPU_ScalarType;

  VectorType result = rhs;
  viennacl::traits::clear(result);            // zero initial guess

  VectorType r      = rhs;                    // current residual
  VectorType r_hat  = r;                      // shadow residual r0*; can be chosen arbitrarily in fact
  VectorType Ap     = rhs;                    // A * p (preconditioned)
  VectorType As     = rhs;                    // A * s (preconditioned)
  VectorType s      = rhs;                    // intermediate residual
  VectorType p      = r;                      // search direction

  CPU_ScalarType rho      = viennacl::linalg::norm_2(r);   // <r, r0*>; re-initialized on restart
  CPU_ScalarType rhs_norm = viennacl::linalg::norm_2(r);   // reference norm for the stopping test
  CPU_ScalarType beta;
  CPU_ScalarType alpha;
  CPU_ScalarType omega;
  CPU_ScalarType rho_new = 0;
  CPU_ScalarType r_norm  = rhs_norm;

  if (rhs_norm == 0)                          // solution is zero if RHS norm is zero
    return result;

  bool       needs_restart = true;            // forces full residual computation on the first pass
  vcl_size_t restart_iter  = 0;

  for (unsigned int i = 0; i < tag.max_iterations(); ++i)
  {
    if (needs_restart)
    {
      // recompute the true (preconditioned) residual and reset the recurrences:
      r = rhs;
      r -= viennacl::linalg::prod(matrix, result);
      precond.apply(r);
      p     = r;
      r_hat = r;
      rho = viennacl::linalg::norm_2(r);
      rho *= rho;                             // <r, r0*> with r0* = r
      needs_restart = false;
      restart_iter  = i;
    }

    tag.iters(i + 1);

    Ap = viennacl::linalg::prod(matrix, p);
    precond.apply(Ap);
    alpha = rho / viennacl::linalg::inner_prod(Ap, r_hat);

    s = r - alpha * Ap;

    As = viennacl::linalg::prod(matrix, s);
    precond.apply(As);
    CPU_ScalarType As_norm = viennacl::linalg::norm_2(As);
    omega = viennacl::linalg::inner_prod(As, s) / (As_norm * As_norm);  // stabilization step

    result += alpha * p + omega * s;
    r = s - omega * As;

    r_norm = viennacl::linalg::norm_2(r);
    if (r_norm / rhs_norm < tag.tolerance())
      break;

    rho_new = viennacl::linalg::inner_prod(r, r_hat);
    beta = rho_new / rho * alpha / omega;
    rho  = rho_new;

    //search direction degenerate. A restart might help
    if (rho == 0 || omega == 0 || i - restart_iter > tag.max_iterations_before_restart())
      needs_restart = true;

    // Execution of
    //   p = r + beta * (p - omega*Ap);
    // without introducing temporary vectors:
    p -= omega * Ap;
    p = r + beta * p;

    //std::cout << "Rel. Residual in current step: " << std::sqrt(std::fabs(viennacl::linalg::inner_prod(r, r) / rhs_norm)) << std::endl;
  }

  //store last error estimate:
  tag.error(r_norm / rhs_norm);

  return result;
}