int GMRES(const Operator &A, Vector &x, const Vector &b, const Preconditioner &M, Matrix &H, int &m, int &max_iter, Real &tol) { Real resid; int i, j = 1, k; Vector s(m+1), cs(m+1), sn(m+1), w,r,Ax; r=M*b; Real normb = sqrt((r,r)); Ax=b; Ax -=A * x; r = M*(Ax); Real beta = sqrt((r,r)); if ( abs(normb) < 1.e-30) normb = 1; if ((resid = beta / normb) <= tol) { tol = resid; max_iter = 0; return 0; } Vector *v = new Vector[m+1]; while (j <= max_iter) { v[0] = r / beta; s = 0.0; s(0) = beta; for (i = 0; i < m && j <= max_iter; i++, j++) { w = M*(Ax=A * v[i]); for (k = 0; k <= i; k++) { H(k, i) = (w, v[k]); w -= H(k, i) * v[k]; } H(i+1, i) = sqrt((w,w)); v[i+1] = w / H(i+1, i) ; for (k = 0; k < i; k++) ApplyPlaneRotation(H(k,i), H(k+1,i), cs(k), sn(k)); GeneratePlaneRotation(H(i,i), H(i+1,i), cs(i), sn(i)); ApplyPlaneRotation(H(i,i), H(i+1,i), cs(i), sn(i)); ApplyPlaneRotation(s(i), s(i+1), cs(i), sn(i)); if ((resid = abs(s(i+1)) / normb) < tol) { Update(x, i, H, s, v); tol = resid; max_iter = j; delete [] v; return 0; } } Update(x, m - 1, H, s, v); Ax = b; Ax -= A*x; r = M*(Ax); beta = sqrt((r,r)); if ((resid = beta / normb) < tol) { tol = resid; max_iter = j; delete [] v; return 0; } } tol = resid; delete [] v; return 1; }
/**
    Purpose
    -------
    Restarted GMRES solver for A * x = b that stores the preconditioned
    basis vectors W(j) and builds the solution update from them.

    Arguments
    ---------
    @param[in]      A            system matrix; A.num_rows gives the
                                 number of unknowns.
    @param[in]      b            right-hand side (b.dval is read).
    @param[in,out]  x            in: initial guess, out: approximation
                                 (x->dval is updated in place).
    @param[in,out]  solver_par   in: restart, rtol, atol, maxiter, verbose;
                                 out: solver, numiter, spmv_count, init_res,
                                 iter_res, final_res, runtime, info and, when
                                 verbose > 0, res_vec / timing.
    @param[in]      precond_par  preconditioner handed to
                                 magma_d_applyprecond_left / _right.
    @param[in]      queue        queue passed to every MAGMA call.

    @return MAGMA_SUCCESS, MAGMA_SLOW_CONVERGENCE, MAGMA_DIVERGENCE,
            MAGMA_NOTCONVERGED, or whatever a failing CHECK stored in info.

    Notes
    -----
    solver_par->numiter counts outer (restart) cycles: it is incremented
    once at the top of every cycle.  The first cycle is therefore
    numiter == 1.  The original test `numiter == 0` could never be true,
    which left resid0 at 1 (turning the relative residual test into an
    absolute one), disabled the early exit, and divided by beta == 0 when
    the initial guess was already exact.

    NOTE(review): V(), W() and H() are macros defined outside this block;
    H() is expected to use the local `m1` -- confirm.
*/
extern "C" magma_int_t
magma_dfgmres(
    magma_d_matrix A, magma_d_matrix b, magma_d_matrix *x,
    magma_d_solver_par *solver_par,
    magma_d_preconditioner *precond_par,
    magma_queue_t queue )
{
    magma_int_t info = MAGMA_NOTCONVERGED;

    magma_int_t dofs = A.num_rows;

    // prepare solver feedback
    solver_par->solver = Magma_PGMRES;
    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    // Chronometry
    real_Double_t tempo1, tempo2;

    magma_int_t dim = solver_par->restart;
    magma_int_t m1 = dim+1; // used inside H macro
    magma_int_t i, j, k;
    double beta;

    double rel_resid, resid0=1, r0=0.0, betanom = 0.0, nom;

    magma_d_matrix v_t={Magma_CSR}, w_t={Magma_CSR}, t={Magma_CSR},
                   t2={Magma_CSR}, V={Magma_CSR}, W={Magma_CSR};

    // v_t and w_t are views: their dval is pointed at V(i) / W(i) below,
    // and they are not freed in the cleanup section.
    v_t.memory_location = Magma_DEV;
    v_t.num_rows = dofs;
    v_t.num_cols = 1;
    v_t.dval = NULL;
    v_t.storage_type = Magma_DENSE;

    w_t.memory_location = Magma_DEV;
    w_t.num_rows = dofs;
    w_t.num_cols = 1;
    w_t.dval = NULL;
    w_t.storage_type = Magma_DENSE;

    double temp;

    double *H={0}, *s={0}, *cs={0}, *sn={0};

    CHECK( magma_dvinit( &t, Magma_DEV, dofs, 1, MAGMA_D_ZERO, queue ));
    CHECK( magma_dvinit( &t2, Magma_DEV, dofs, 1, MAGMA_D_ZERO, queue ));

    CHECK( magma_dmalloc_pinned( &H, (dim+1)*dim ));
    CHECK( magma_dmalloc_pinned( &s, dim+1 ));
    CHECK( magma_dmalloc_pinned( &cs, dim ));
    CHECK( magma_dmalloc_pinned( &sn, dim ));

    CHECK( magma_dvinit( &V, Magma_DEV, dofs*(dim+1), 1, MAGMA_D_ZERO, queue ));
    CHECK( magma_dvinit( &W, Magma_DEV, dofs*dim, 1, MAGMA_D_ZERO, queue ));

    CHECK( magma_dresidual( A, b, *x, &nom, queue));
    solver_par->init_res = nom;

    if ( ( nom * solver_par->rtol) < ATOLERANCE )
        r0 = ATOLERANCE;

    solver_par->numiter = 0;
    solver_par->spmv_count = 0;

    tempo1 = magma_sync_wtime( queue );
    do
    {
        solver_par->numiter++;

        // compute initial residual and its norm
        // A.mult(n, 1, x, n, V(0), n);                       // V(0) = A*x
        CHECK( magma_d_spmv( MAGMA_D_ONE, A, *x, MAGMA_D_ZERO, t, queue ));
        solver_par->spmv_count++;
        magma_dcopy( dofs, t.dval, 1, V(0), 1, queue );
        temp = MAGMA_D_MAKE(-1.0, 0.0);
        magma_daxpy( dofs, temp, b.dval, 1, V(0), 1, queue ); // V(0) = V(0) - b
        beta = MAGMA_D_MAKE( magma_dnrm2( dofs, V(0), 1, queue ), 0.0 ); // beta = norm(V(0))
        if ( magma_d_isnan_inf( beta ) ) {
            info = MAGMA_DIVERGENCE;
            break;
        }

        // First restart cycle: numiter has just been incremented from 0,
        // so the first cycle is numiter == 1.  Record the very first
        // residual norm and leave early if it is already small enough
        // (this also avoids the division by beta below when beta == 0).
        if ( solver_par->numiter == 1 ) {
            solver_par->init_res = MAGMA_D_REAL( beta );
            resid0 = MAGMA_D_REAL( beta );

            r0 = resid0 * solver_par->rtol;
            if ( r0 < ATOLERANCE )
                r0 = ATOLERANCE;
            if ( resid0 < r0 ) {
                solver_par->final_res = solver_par->init_res;
                solver_par->iter_res = solver_par->init_res;
                info = MAGMA_SUCCESS;
                goto cleanup;
            }
        }
        if ( solver_par->verbose > 0 ) {
            solver_par->res_vec[0] = resid0;
            solver_par->timing[0] = 0.0;
        }
        temp = -1.0/beta;
        magma_dscal( dofs, temp, V(0), 1, queue );            // V(0) = -V(0)/beta

        for (i = 1; i < dim+1; i++)
            s[i] = MAGMA_D_ZERO;
        s[0] = beta;

        i = -1;
        do {
            i++;

            // M.apply(n, 1, V(i), n, W(i), n);
            v_t.dval = V(i);
            CHECK( magma_d_applyprecond_left( MagmaNoTrans, A, v_t, &t, precond_par, queue ));
            CHECK( magma_d_applyprecond_right( MagmaNoTrans, A, t, &t2, precond_par, queue ));
            magma_dcopy( dofs, t2.dval, 1, W(i), 1, queue );

            // A.mult(n, 1, W(i), n, V(i+1), n);
            w_t.dval = W(i);
            CHECK( magma_d_spmv( MAGMA_D_ONE, A, w_t, MAGMA_D_ZERO, t, queue ));
            solver_par->spmv_count++;
            magma_dcopy( dofs, t.dval, 1, V(i+1), 1, queue );

            // Gram-Schmidt sweep: orthogonalise V(i+1) against V(0..i).
            for (k = 0; k <= i; k++) {
                H(k, i) = magma_ddot( dofs, V(k), 1, V(i+1), 1, queue );
                temp = -H(k,i);
                // V(i+1) -= H(k, i) * V(k);
                magma_daxpy( dofs, temp, V(k), 1, V(i+1), 1, queue );
            }

            H(i+1, i) = MAGMA_D_MAKE( magma_dnrm2( dofs, V(i+1), 1, queue), 0. ); // H(i+1,i) = ||r||
            temp = 1.0 / H(i+1, i);
            // V(i+1) = V(i+1) / H(i+1, i)
            magma_dscal( dofs, temp, V(i+1), 1, queue );      // (to be fused)

            for (k = 0; k < i; k++)
                ApplyPlaneRotation(&H(k,i), &H(k+1,i), cs[k], sn[k]);

            GeneratePlaneRotation(H(i,i), H(i+1,i), &cs[i], &sn[i]);
            ApplyPlaneRotation(&H(i,i), &H(i+1,i), cs[i], sn[i]);
            ApplyPlaneRotation(&s[i], &s[i+1], cs[i], sn[i]);

            betanom = MAGMA_D_ABS( s[i+1] );
            rel_resid = betanom / resid0;
            if ( solver_par->verbose > 0 ) {
                tempo2 = magma_sync_wtime( queue );
                if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                    solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) betanom;
                    solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                        = (real_Double_t) tempo2-tempo1;
                }
            }
            if ( rel_resid <= solver_par->rtol || betanom <= solver_par->atol ) {
                info = MAGMA_SUCCESS;
                break;
            }
        }
        while (i+1 < dim && solver_par->numiter+1 <= solver_par->maxiter);

        // solve upper triangular system in place
        for (j = i; j >= 0; j--) {
            s[j] /= H(j,j);
            for (k = j-1; k >= 0; k--)
                s[k] -= H(k,j) * s[j];
        }

        // update the solution
        for (j = 0; j <= i; j++) {
            // x = x + s[j] * W(j)
            magma_daxpy( dofs, s[j], W(j), 1, x->dval, 1, queue );
        }
    }
    while (rel_resid > solver_par->rtol
           && solver_par->numiter+1 <= solver_par->maxiter);

    tempo2 = magma_sync_wtime( queue );
    solver_par->runtime = (real_Double_t) tempo2-tempo1;
    double residual;
    CHECK( magma_dresidual( A, b, *x, &residual, queue ));
    solver_par->iter_res = betanom;
    solver_par->final_res = residual;

    if ( solver_par->numiter < solver_par->maxiter && info == MAGMA_SUCCESS ) {
        info = MAGMA_SUCCESS;
    } else if ( solver_par->init_res > solver_par->final_res ) {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_SLOW_CONVERGENCE;
        if ( solver_par->iter_res < solver_par->rtol*solver_par->init_res ||
             solver_par->iter_res < solver_par->atol ) {
            info = MAGMA_SUCCESS;
        }
    }
    else {
        if ( solver_par->verbose > 0 ) {
            if ( (solver_par->numiter)%solver_par->verbose==0 ) {
                solver_par->res_vec[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) betanom;
                solver_par->timing[(solver_par->numiter)/solver_par->verbose]
                    = (real_Double_t) tempo2-tempo1;
            }
        }
        info = MAGMA_DIVERGENCE;
    }

cleanup:
    // free pinned memory
    magma_free_pinned(s);
    magma_free_pinned(cs);
    magma_free_pinned(sn);
    magma_free_pinned(H);

    // free DEV memory
    magma_dmfree( &V, queue);
    magma_dmfree( &W, queue);
    magma_dmfree( &t, queue);
    magma_dmfree( &t2, queue);

    solver_par->info = info;
    return info;
}   /* magma_dfgmres */
bool IterativeSolvers::gmres(const IRCMatrix &A, Vector &x, const Vector &b, const Preconditioner &M) { const idx N = x.getLength(); idx i, j = 1, k; Vector s(maxInnerIter + 1); Vector cs(maxInnerIter + 1); Vector sn(maxInnerIter + 1); Vector w(N); real normb = norm(M.solve(b)); Vector r = M.solve(b - A * x); real beta = norm(r); if (normb == 0.0) normb = 1; real res(norm(r) / normb); if (res <= toler) { toler = res; maxIter = 0; return true; } Vector *v = new Vector[maxInnerIter + 1]; for (idx id = 0; id < maxInnerIter + 1; ++id) v[id] = Vector(N); // CREATE HESSENBERG MATRIX NEEDED TO STORE INTERMEDIATES DenseMatrix H(maxInnerIter + 1, maxInnerIter); Vector temp(N); Vector temp2(maxInnerIter + 1); // MAIN LOOP while (j <= maxIter) { v[0] = r * (1.0 / beta); s = 0.0; s(0) = beta; // INNER ITERATIONS for (i = 0; i < maxInnerIter && j <= maxIter; i++, j++) { // CALCULATE w = M^{-1}(A*v[i]) multiply(A, v[i], temp); M.solveMxb(w, temp); // PRE-CALCULATE DOT PRODUCTS IN PARALLEL // H(k,i) = dot( v[k], w) #ifdef USES_OPENMP #pragma omp parallel for #endif for (k = 0; k <= i ; ++k) { register real dp(0); for (idx id = 0 ; id < N ; ++id) dp += w[id] * v[k][id]; H(k, i) = dp; //dot(w,v[k]); } for (k = 0; k <= i; ++k) { // w -= v[k]*H(k,i) without temporaries register real tempr = H(k, i); #ifdef USES_OPENMP #pragma omp parallel for // why is this loop so critical?? 
#endif for (idx id = 0 ; id < N ; ++id) w[id] -= v[k][id] * tempr; } // BELOW PARALLEL REGION CALCULATES: // H(i+1,i) = norm(w); // v[i+1] = w * (1.0 / H(i+1, i)); H(i + 1, i) = 0; real tempr(0); #ifdef USES_OPENMP #pragma omp parallel shared(tempr) #endif { #ifdef USES_OPENMP #pragma omp for reduction(+:tempr) #endif for (idx id = 0 ; id < N ; ++id) tempr += w[id] * w[id]; //norm(w); #ifdef USES_OPENMP #pragma omp single #endif { H(i + 1, i) = sqrt(tempr); tempr = (1.0 / H(i + 1, i)); } #ifdef USES_OPENMP #pragma omp for #endif for (idx id = 0 ; id < N ; ++id) v[i + 1][id] = w[id] * tempr; }// end for omp parallel for (k = 0; k < i; k++) ApplyPlaneRotation(H(k, i), H(k + 1, i), cs(k), sn(k)); GeneratePlaneRotation(H(i, i), H(i + 1, i), cs(i), sn(i)); ApplyPlaneRotation(H(i, i), H(i + 1, i), cs(i), sn(i)); ApplyPlaneRotation(s(i), s(i + 1), cs(i), sn(i)); res = fabs(s(i + 1)) / normb; if (res < toler) { // COPY S INTO temp WITHOUT RESIZING for (idx id = 0 ; id < maxInnerIter + 1 ; ++id) temp2[id] = s[id]; Update(x, i, H, temp2, v); toler = res; maxIter = j; delete [] v; return true; } }// end for i IINNER ITERATIONS // COPY S INTO temp WITHOUT RESIZING for (idx id = 0 ; id < maxInnerIter + 1 ; ++id) temp2[id] = s[id]; Update(x, maxInnerIter - 1, H, temp2, v); //multiply(A, x, temp); //r = M.solve(b - A * x); M.solveMxb(r, b - A * x); beta = norm(r); res = beta / normb; if (res < toler) { toler = res; maxIter = j; delete [] v; return true; } } toler = res; delete [] v; return false; }
/*
 * Restarted GMRES on an N x N grid of unknowns (N*N values stored in x),
 * with the heavy loops offloaded through OpenACC directives.
 *
 *   A            not referenced by this implementation.
 *   D            N*N coefficient array passed to cublas_gemm().
 *   x            in/out: N*N solution array, updated in place.
 *   b            N*N right-hand side.
 *   N            grid dimension.
 *   max_restart  maximum number of restart cycles.
 *   max_iter     Krylov iterations per cycle.
 *   tol          convergence tolerance on the residual estimate.
 *
 * Progress is reported with printf(); nothing is returned.
 *
 * NOTE(review): fastpoisson(), norm_gpu(), dot_gpu(), q_subQ_gpu(),
 * subQ_v_gpu(), cublas_gemm(), GeneratePlaneRotation() and backsolve()
 * are defined elsewhere; their exact semantics are assumed from how they
 * are used here -- confirm.
 *
 * Changes from the previous version:
 *  - every work buffer is released before returning (all were leaked);
 *  - H holds (max_iter+1) rows of max_iter entries: the code writes
 *    H[max_iter*(i+1)+i] for i < max_iter, which overran the former
 *    (N+1)*max_iter allocation whenever max_iter > N;
 *  - allocation sizes are computed in size_t and checked;
 *  - N <= 0 or max_iter <= 0 returns immediately (nothing to solve).
 */
void gmres(double *A, double *D, double *x, double *b, int N, int max_restart, int max_iter, double tol)
{
    int i, j, k, l, m, N2;
    double resid, temp;
    double *normb = NULL, *beta = NULL, *nrm_temp = NULL;
    double *M_temp = NULL, *r = NULL, *q = NULL, *v = NULL, *M_b = NULL, *w = NULL;
    double *cs = NULL, *sn = NULL, *s = NULL, *y = NULL, *Q = NULL, *H = NULL;
    size_t vec_bytes, rot_bytes;
    int h_rows, h_len;

    (void) A;

    if (N <= 0 || max_iter <= 0)
        return;

    N2 = N*N;
    vec_bytes = (size_t) N2 * sizeof(double);
    rot_bytes = ((size_t) max_iter + 1) * sizeof(double);
    /* Never smaller than the original (N+1) rows, never smaller than the
     * (max_iter+1) rows the Hessenberg indexing needs. */
    h_rows = (N > max_iter ? N : max_iter) + 1;
    h_len = h_rows * max_iter;

    normb    = (double *) malloc(sizeof(double));
    beta     = (double *) malloc(sizeof(double));
    nrm_temp = (double *) malloc(sizeof(double));
    Q        = (double *) malloc(vec_bytes * ((size_t) max_iter + 1));
    H        = (double *) malloc((size_t) h_len * sizeof(double));
    M_temp   = (double *) malloc(vec_bytes);
    M_b      = (double *) malloc(vec_bytes);
    r        = (double *) malloc(vec_bytes);
    q        = (double *) malloc(vec_bytes);
    v        = (double *) malloc(vec_bytes);
    w        = (double *) malloc(vec_bytes);
    cs       = (double *) malloc(rot_bytes);
    sn       = (double *) malloc(rot_bytes);
    s        = (double *) malloc(rot_bytes);
    y        = (double *) malloc(rot_bytes);

    if (!normb || !beta || !nrm_temp || !Q || !H || !M_temp || !M_b || !r ||
        !q || !v || !w || !cs || !sn || !s || !y) {
        fprintf(stderr, "gmres: out of memory\n");
        goto cleanup;
    }

    /* Initial residual; x is not read here, so the starting guess is
     * effectively taken as zero. */
    #pragma acc data copyin(b[0:N2]) copyout(M_b[0:N2], r[0:N2], normb[0], beta[0])
    {
        fastpoisson(b, M_b, N);
        norm_gpu(M_b, normb, N2);
        #pragma acc parallel loop independent
        for (k=0; k<N2; k++)
            r[k] = M_b[k];
        norm_gpu(r, beta, N2);
    }

    if ((resid = *beta / *normb) <= tol) {
        tol = resid;
        max_iter = 0;
    }

    for (m=0; m<max_restart; m++) {
        /* First basis vector and right-hand side of the least-squares problem. */
        for (i=0; i<N2; i++)
            Q[i] = r[i] / *beta;
        for (i=0; i<max_iter; i++)
            s[i+1] = 0.0;
        s[0] = *beta;

        for (i=0; i<max_iter; i++) {
            #pragma acc data copyin(D[0:N2]) copy(Q[0:N2*(max_iter+1)], H[0:h_len], cs[0:max_iter+1], sn[0:max_iter+1], s[0:max_iter+1]) create(q[0:N2], v[0:N2], M_temp[0:N2], w[0:N2], y[0:max_iter+1])
            {
                q_subQ_gpu(q, Q, N2, i);
                cublas_gemm(N, v, D, q);
                fastpoisson(v, M_temp, N);
                #pragma acc parallel loop independent present(w, q, M_temp)
                for (k=0; k<N*N; k++)
                    w[k] = q[k] + M_temp[k];

                // h(k,i) = qk*w
                for (k=0; k<=i; k++) {
                    #pragma acc parallel loop independent
                    for (j=0; j<N2; j++) {
                        q[j] = Q[N2*k+j];
                    }
                    dot_gpu(q, w, nrm_temp, N2);
                    H[max_iter*k+i] = *nrm_temp;
                }

                #pragma acc parallel loop seq
                for (k=0; k<=i; k++) {
                    #pragma acc loop independent
                    for (j=0; j<N2; j++)
                        w[j] -= H[max_iter*k+i]*Q[N2*k+j];
                }

                norm_gpu(w, nrm_temp, N2);
                H[max_iter*(i+1)+i] = *nrm_temp;
                subQ_v_gpu(Q, w, N2, i+1, H[max_iter*(i+1)+i]);

                #pragma acc kernels
                for (k = 0; k < i; k++) {
                    //ApplyPlaneRotation(H(k,i), H(k+1,i), cs(k), sn(k))
                    temp = cs[k]*H[max_iter*k+i] + sn[k]*H[max_iter*(k+1)+i];
                    H[max_iter*(k+1)+i] = -1.0*sn[k]*H[max_iter*k+i] + cs[k]*H[max_iter*(k+1)+i];
                    H[max_iter*k+i] = temp;
                } // end kernels

                GeneratePlaneRotation(H[max_iter*i+i], H[max_iter*(i+1)+i], cs, sn, i);

                #pragma acc kernels
                {
                    //ApplyPlaneRotation(H(i,i), H(i+1,i), cs(i), sn(i))
                    H[max_iter*i+i] = cs[i]*H[max_iter*i+i] + sn[i]*H[max_iter*(i+1)+i];
                    H[max_iter*(i+1)+i] = 0.0;
                    //ApplyPlaneRotation(s(i), s(i+1), cs(i), sn(i));
                    temp = cs[i]*s[i];
                    s[i+1] = -1.0*sn[i]*s[i];
                    s[i] = temp;
                    resid = fabs(s[i+1] / *beta);
                } // end kernels
            } // end pragma data

            if (resid < tol) {
                backsolve(H, s, y, N, max_iter, i);
                #pragma acc data copyin(Q[0:N2*(max_iter+1)], y[0:max_iter+1]) copy(x[0:N2])
                #pragma acc parallel loop independent
                for (j=0; j<N; j++) {
                    #pragma acc loop independent
                    for (l=0; l<N; l++) {
                        #pragma acc loop seq
                        for (k=0; k<=i; k++) {
                            x[N*j+l] += Q[N2*k+N*j+l]*y[k];
                        }
                    }
                }
                break;
            }
        } //end inside for

        if (resid < tol) {
            printf(" resid = %e \n", resid);
            printf(" Converges at %d cycle %d step. \n", m, i+1);
            break;
        }

        // Caution : the loop left i one past the last computed column.
        i = i - 1;
        backsolve(H, s, y, N, max_iter, i);
        #pragma acc data copyin(Q[0:N2*(max_iter+1)], y[0:max_iter+1]) copy(x[0:N2])
        #pragma acc parallel loop independent
        for (j=0; j<N; j++) {
            #pragma acc loop independent
            for (l=0; l<N; l++) {
                #pragma acc loop seq
                for (k=0; k<=i; k++) {
                    x[N*j+l] += Q[N2*k+N*j+l]*y[k];
                }
            }
        }

        /* Recompute the residual from the updated x for the restart. */
        #pragma acc data copyin(D[0:N2], M_b[0:N2], x[0:N2]) create(v[0:N2], M_temp[0:N2]) copyout(r[0:N2])
        {
            cublas_gemm(N, v, D, x);
            fastpoisson(v, M_temp, N);
            #pragma acc parallel loop independent
            for (j=0; j<N2; j++)
                r[j] = M_b[j] - (x[j] + M_temp[j]);
            norm_gpu(r, beta, N2);
        }
        s[i+1] = *beta;
        resid = s[i+1] / *normb;

        if (resid < tol) {
            printf(" resid = %e \n", resid);
            printf(" Converges at %d cycle %d step. \n", m, i);
            break;
        }
    } //end outside for

cleanup:
    free(normb);
    free(beta);
    free(nrm_temp);
    free(Q);
    free(H);
    free(M_temp);
    free(M_b);
    free(r);
    free(q);
    free(v);
    free(w);
    free(cs);
    free(sn);
    free(s);
    free(y);
}
void gmres(double *A, double *D, double *x, double *b, int N, int max_restart, int max_iter, double tol) { int i, j, k, l, m, N2; double resid, *normb, *beta, *temp_nrm, temp, *r, *q, *Aq, *qA, *Dq, *w, *cs, *sn, *s, *y, *Q, *H, *res; normb = (double *) malloc(1*sizeof(double)); beta = (double *) malloc(1*sizeof(double)); temp_nrm = (double *) malloc(1*sizeof(double)); Q = (double *) malloc(N*N*(max_iter+1)*sizeof(double)); H = (double *) malloc((N+1)*max_iter*sizeof(double)); r = (double *) malloc(N*N*sizeof(double)); q = (double *) malloc(N*N*sizeof(double)); Aq = (double *) malloc(N*N*sizeof(double)); qA = (double *) malloc(N*N*sizeof(double)); Dq = (double *) malloc(N*N*sizeof(double)); w = (double *) malloc(N*N*sizeof(double)); cs = (double *) malloc((max_iter+1)*sizeof(double)); sn = (double *) malloc((max_iter+1)*sizeof(double)); s = (double *) malloc((max_iter+1)*sizeof(double)); y = (double *) malloc((max_iter+1)*sizeof(double)); res = (double *) malloc(max_iter*sizeof(double)); N2 = N*N; norm(b, normb, N2); for (k=0; k<N2; k++) r[k] = b[k]; norm(r, beta, N2); if ((resid = *beta / *normb) <= tol) { tol = resid; max_iter = 0; } for (m=0; m<max_restart; m++) { for (i=0; i<N2; i++) Q[i] = r[i] / *beta; for (i=0; i<max_iter; i++) s[i+1] = 0.0; s[0] = *beta; for (i = 0; i<max_iter; i++) { q_subQ(q, Q, N2, i); matrix_matrix(A, q, Aq, N); matrix_matrix(q, A, qA, N); matrix_matrix(D, q, Dq, N); for (k=0; k<N2; k++) w[k] = Aq[k] + qA[k] + Dq[k]; for (k=0; k<=i; k++) { q_subQ(q, Q, N2, k); H[max_iter*k+i] = inner_product(q, w, N2); w_shift(w, q, H[max_iter*k+i], N2); } /* for (k=0; k<=i; k++) { H[max_iter*k+i] = 0.0; for (j=0; j<N2; j++) H[max_iter*k+i] += Q[N2*k+j]*w[j]; } for (k=0; k<=i; k++) { for (j=0; j<N2; j++) w[j] = w[j] - H[max_iter*k+i]*Q[N2*k+j]; } */ norm(w, temp_nrm, N2); H[max_iter*(i+1)+i] = *temp_nrm; subQ_v(Q, w, N2, i+1, H[max_iter*(i+1)+i]); for (k = 0; k < i; k++) { //ApplyPlaneRotation(H(k,i), H(k+1,i), cs(k), sn(k)) temp = cs[k]*H[max_iter*k+i] 
+ sn[k]*H[max_iter*(k+1)+i]; H[max_iter*(k+1)+i] = -1.0*sn[k]*H[max_iter*k+i] + cs[k]*H[max_iter*(k+1)+i]; H[max_iter*k+i] = temp; } GeneratePlaneRotation(H[max_iter*i+i], H[max_iter*(i+1)+i], cs, sn, i); //ApplyPlaneRotation(H(i,i), H(i+1,i), cs(i), sn(i)) H[max_iter*i+i] = cs[i]*H[max_iter*i+i] + sn[i]*H[max_iter*(i+1)+i]; H[max_iter*(i+1)+i] = 0.0; //ApplyPlaneRotation(s(i), s(i+1), cs(i), sn(i)); temp = cs[i]*s[i]; s[i+1] = -1.0*sn[i]*s[i]; s[i] = temp; resid = fabs(s[i+1] / *beta); res[i] = resid; if (resid < tol) { // backsolve(H, s, y, N, max_iter, i); for (k=0; k<max_iter+1; k++) y[k] = s[k]; cblas_dtrsv(CblasRowMajor, CblasUpper, CblasNoTrans, CblasNonUnit, i, H, max_iter, y, 1); for(j=0; j<N; j++) { for (l=0; l<N; l++) { for(k=0; k<=i; k++) { x[N*j+l] += Q[N2*k+N*j+l]*y[k]; } } } break; } }//end inside for if (resid < tol) { printf(" resid = %e \n", resid); printf(" Converges at %d cycle %d step. \n", m, i+1); break; } // Caution : i = i + 1. i = i - 1; backsolve(H, s, y, N, max_iter, i); for(j=0; j<N; j++) { for (l=0; l<N; l++) { for(k=0; k<=i; k++) { x[N*j+l] += Q[N2*k+N*j+l]*y[k]; } } } matrix_matrix(A, x, Aq, N); matrix_matrix(x, A, qA, N); matrix_matrix(D, x, Dq, N); for (j=0; j<N2; j++) r[j] = b[j] - (Aq[j] + qA[j] + Dq[j]); norm(r, beta, N2); s[i+1] = *beta; resid = s[i+1] / *normb; if ( resid < tol) { printf(" resid = %e \n", resid); printf(" Converges at %d cycle %d step. \n", m, i); break; } }//end outside for }