int gmres_dr(spinor * const P,spinor * const Q, const int m, const int nr_ev, const int max_restarts, const double eps_sq, const int rel_prec, const int N, matrix_mult f){ int restart=0, i, j, k, l; double beta, eps, norm, beta2=0.; complex *lswork = NULL; int lwork; complex tmp1, tmp2; int info=0; int _m = m, mp1 = m+1, np1 = nr_ev+1, ne = nr_ev, V2 = 12*(VOLUMEPLUSRAND)/2, _N = 12*N; spinor ** solver_field = NULL; const int nr_sf = 3; if(N == VOLUME) { init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); } else { init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); } double err=0.; spinor * r0, * x0; cmone.re = -1.; cmone.im=0.; cpone.re = 1.; cpone.im=0.; czero.re = 0.; czero.im = 0.; r0 = solver_field[0]; x0 = solver_field[2]; eps=sqrt(eps_sq); init_gmres_dr(m, (VOLUMEPLUSRAND)); norm = sqrt(square_norm(Q, N, 1)); assign(x0, P, N); /* first normal GMRES cycle */ /* r_0=Q-AP (b=Q, x+0=P) */ f(r0, x0); diff(r0, Q, r0, N); /* v_0=r_0/||r_0|| */ alpha[0].re=sqrt(square_norm(r0, N, 1)); err = alpha[0].re; if(g_proc_id == g_stdio_proc && g_debug_level > 0){ printf("%d\t%e true residue\n", restart*m, alpha[0].re*alpha[0].re); fflush(stdout); } if(alpha[0].re==0.){ assign(P, x0, N); finalize_solver(solver_field, nr_sf); return(restart*m); } mul_r(V[0], 1./alpha[0].re, r0, N); for(j = 0; j < m; j++){ /* solver_field[0]=A*v_j */ /* Set h_ij and omega_j */ /* solver_field[1] <- omega_j */ f(solver_field[1], V[j]); /* assign(solver_field[1], solver_field[0], N); */ for(i = 0; i <= j; i++){ H[i][j] = scalar_prod(V[i], solver_field[1], N, 1); /* G, work and work2 are in Fortran storage: columns first */ G[j][i] = H[i][j]; work2[j][i] = H[i][j]; work[i][j].re = H[i][j].re; work[i][j].im = -H[i][j].im; assign_diff_mul(solver_field[1], V[i], H[i][j], N); } _complex_set(H[j+1][j], sqrt(square_norm(solver_field[1], N, 1)), 0.); G[j][j+1] = H[j+1][j]; work2[j][j+1] = H[j+1][j]; work[j+1][j].re = H[j+1][j].re; work[j+1][j].im = -H[j+1][j].im; beta2 = H[j+1][j].re*H[j+1][j].re; for(i = 0; i < j; i++){ tmp1 = H[i][j]; tmp2 = H[i+1][j]; _mult_real(H[i][j], tmp2, s[i]); _add_assign_complex_conj(H[i][j], c[i], tmp1); _mult_real(H[i+1][j], tmp1, s[i]); _diff_assign_complex(H[i+1][j], c[i], tmp2); } /* Set beta, s, c, alpha[j],[j+1] */ beta = sqrt(_complex_square_norm(H[j][j]) + _complex_square_norm(H[j+1][j])); s[j] = H[j+1][j].re / beta; _mult_real(c[j], H[j][j], 1./beta); _complex_set(H[j][j], beta, 0.); _mult_real(alpha[j+1], alpha[j], s[j]); tmp1 = alpha[j]; _mult_assign_complex_conj(alpha[j], c[j], tmp1); /* precision reached? */ if(g_proc_id == g_stdio_proc && g_debug_level > 0){ printf("%d\t%e residue\n", restart*m+j, alpha[j+1].re*alpha[j+1].re); fflush(stdout); } if(((alpha[j+1].re <= eps) && (rel_prec == 0)) || ((alpha[j+1].re <= eps*norm) && (rel_prec == 1))){ _mult_real(alpha[j], alpha[j], 1./H[j][j].re); assign_add_mul(x0, V[j], alpha[j], N); for(i = j-1; i >= 0; i--){ for(k = i+1; k <= j; k++){ _mult_assign_complex(tmp1, H[i][k], alpha[k]); /* alpha[i] -= tmp1 */ _diff_complex(alpha[i], tmp1); } _mult_real(alpha[i], alpha[i], 1./H[i][i].re); assign_add_mul(x0, V[i], alpha[i], N); } for(i = 0; i < m; i++){ alpha[i].im = 0.; } assign(P, x0, N); finalize_solver(solver_field, nr_sf); return(restart*m+j); } /* if not */ else { mul_r(V[(j+1)], 1./H[j+1][j].re, solver_field[1], N); } } j=m-1; /* prepare for restart */ _mult_real(alpha[j], alpha[j], 1./H[j][j].re); assign_add_mul(x0, V[j], alpha[j], N); if(g_proc_id == 0 && g_debug_level > 3) { printf("alpha: %e %e\n", alpha[j].re, alpha[j].im); } for(i = j-1; i >= 0; i--){ for(k = i+1; k <= j; k++){ _mult_assign_complex(tmp1, H[i][k], alpha[k]); _diff_complex(alpha[i], tmp1); } _mult_real(alpha[i], alpha[i], 1./H[i][i].re); if(g_proc_id == 0 && g_debug_level > 3) { printf("alpha: %e %e\n", alpha[i].re, alpha[i].im); } assign_add_mul(x0, V[i], alpha[i], N); } /* This produces c=V_m+1*r0 */ for(i = 0; i < mp1; i++) { c[i] = scalar_prod(V[i], r0, N, 1); if(g_proc_id == 0 && g_debug_level > 3) { printf("c: %e %e err = %e\n", c[i].re, c[i].im, err); } } for(restart = 1; restart < max_restarts; restart++) { /* compute c-\bar H \alpha */ _FT(zgemv) ("N", &mp1, &_m, &cmone, G[0], &mp1, alpha, &one, &cpone, c, &one, 1); err = sqrt(short_scalar_prod(c, c, mp1).re); if(g_proc_id == 0 && g_debug_level > 0) { printf("%d\t %e short residue\n", m*restart, err*err); } /* Compute new residual r0 */ /* r_0=Q-AP (b=Q, x+0=P) */ if(g_debug_level > 0) { f(r0, x0); diff(r0, Q, r0, N); tmp1.im=sqrt(square_norm(r0, N, 1)); if(g_proc_id == g_stdio_proc){ printf("%d\t%e true residue\n", m*restart, tmp1.im*tmp1.im); fflush(stdout); } } mul(r0, c[0], V[0], N); for(i = 1; i < mp1; i++) { assign_add_mul(r0, V[i], c[i], N); } if(g_debug_level > 3) { tmp1.im=sqrt(square_norm(r0, N, 1)); if(g_proc_id == g_stdio_proc){ printf("%d\t%e residue\n", m*restart, tmp1.im*tmp1.im); fflush(stdout); } } /* Stop if satisfied */ if(err < eps){ assign(P, x0, N); finalize_solver(solver_field, nr_sf); return(restart*m); } /* Prepare to compute harmonic Ritz pairs */ for(i = 0; i < m-1; i++){ alpha[i].re = 0.; alpha[i].im = 0.; } alpha[m-1].re = 1.; alpha[m-1].im = 0.; _FT(zgesv) (&_m, &one, work[0], &mp1, idx, alpha, &_m, &info); for(i = 0; i < m; i++) { G[m-1][i].re += (beta2*alpha[idx[i]-1].re); G[m-1][i].im += (beta2*alpha[idx[i]-1].im); } if(g_proc_id == 0 && g_debug_level > 3){ printf("zgesv returned info = %d, c[m-1]= %e, %e , idx[m-1]=%d\n", info, alpha[idx[m-1]-1].re, alpha[idx[m-1]-1].im, idx[m-1]); } /* c - \bar H * d -> c */ /* G contains H + \beta^2 H^-He_n e_n^H */ /* Compute harmonic Ritz pairs */ diagonalise_general_matrix(m, G[0], mp1, alpha, evalues); for(i = 0; i < m; i++) { sortarray[i] = _complex_square_norm(evalues[i]); idx[i] = i; } quicksort(m, sortarray, idx); if(g_proc_id == g_stdio_proc && g_debug_level > 1) { for(i = 0; i < m; i++) { printf("# Evalues %d %e %e \n", i, evalues[idx[i]].re, evalues[idx[i]].im); } fflush(stdout); } /* Copy the first nr_ev eigenvectors to work */ for(i = 0; i < ne; i++) { for(l = 0; l < m; l++) { work[i][l] = G[idx[i]][l]; } } /* Orthonormalize them */ for(i = 0; i < ne; i++) { work[i][m].re = 0.; work[i][m].im = 0.; short_ModifiedGS(work[i], m, i, work[0], mp1); } /* Orthonormalize c - \bar H d to work */ short_ModifiedGS(c, m+1, ne, work[0], mp1); for(i = 0; i < mp1; i++) { work[nr_ev][i] = c[i]; } /* Now compute \bar H = P^T_k+1 \bar H_m P_k */ for(i = 0; i < mp1; i++) { for(l = 0; l < mp1; l++) { H[i][l].re = 0.; H[i][l].im = 0.; } } _FT(zgemm) ("N", "N", &mp1, &ne, &_m, &cpone, work2[0], &mp1, work[0], &mp1, &czero, G[0], &mp1, 1, 1); _FT(zgemm) ("C", "N", &np1, &ne , &mp1, &cpone, work[0], &mp1, G[0], &mp1, &czero, H[0], &mp1, 1, 1); if(g_debug_level > 3) { for(i = 0; i < ne+1; i++) { for(l = 0; l < ne+1; l++) { if(g_proc_id == 0) { printf("(g[%d], g[%d]) = %e, %e\n", i, l, short_scalar_prod(work[i], work[l], m+1).re, short_scalar_prod(work[i], work[l], m+1).im); printf("(g[%d], g[%d]) = %e, %e\n", l, i, short_scalar_prod(work[l], work[i], m+1).re, short_scalar_prod(work[l], work[i], m+1).im); } } } } /* V_k+1 = V_m+1 P_k+1 */ /* _FT(zgemm) ("N", "N", &_N, &np1, &mp1, &cpone, (complex*)V[0], &V2, work[0], &mp1, &czero, (complex*)Z[0], &V2, 1, 1); */ for(l = 0; l < np1; l++) { mul(Z[l], work[l][0], V[0], N); for(i = 1; i < mp1; i++) { assign_add_mul(Z[l], V[i], work[l][i], N); } } /* copy back to V */ for(i = 0; i < np1; i++) { assign(V[i], Z[i], N); } /* Reorthogonalise v_nr_ev */ ModifiedGS((complex*)V[nr_ev], _N, nr_ev, (complex*)V[0], V2); if(g_debug_level > 3) { for(i = 0; i < np1; i++) { for(l = 0; l < np1; l++) { tmp1 = scalar_prod(V[l], V[i], N, 1); if(g_proc_id == 0) { printf("(V[%d], V[%d]) = %e %e %d %d %d %d %d %d %e %e\n", l, i, tmp1.re, tmp1.im, np1, mp1, ne, _m, _N, V2, H[l][i].re, H[l][i].im); } } } } /* Copy the content of H to work, work2 and G */ for(i=0; i < mp1; i++) { for(l = 0; l < mp1; l++) { G[i][l] = H[i][l]; work2[i][l] = H[i][l]; work[l][i].re = H[i][l].re; work[l][i].im = -H[i][l].im; } } for(j = ne; j < m; j++) { /* solver_field[0]=A*v_j */ f(solver_field[1], V[j]); /* Set h_ij and omega_j */ /* solver_field[1] <- omega_j */ /* assign(solver_field[1], solver_field[0], N); */ for(i = 0; i <= j; i++){ H[j][i] = scalar_prod(V[i], solver_field[1], N, 1); /* H, G, work and work2 are now all in Fortran storage: columns first */ G[j][i] = H[j][i]; work2[j][i] = H[j][i]; work[i][j].re = H[j][i].re; work[i][j].im = -H[j][i].im; assign_diff_mul(solver_field[1], V[i], H[j][i], N); } beta2 = square_norm(solver_field[1], N, 1); _complex_set(H[j][j+1], sqrt(beta2), 0.); G[j][j+1] = H[j][j+1]; work2[j][j+1] = H[j][j+1]; work[j+1][j].re = H[j][j+1].re; work[j+1][j].im = -H[j][j+1].im; mul_r(V[(j+1)], 1./H[j][j+1].re, solver_field[1], N); } /* Solve the least square problem for alpha*/ /* This produces c=V_m+1*r0 */ for(i = 0; i < mp1; i++) { c[i] = scalar_prod(V[i], r0, N, 1); alpha[i] = c[i]; if(g_proc_id == 0 && g_debug_level > 3) { printf("c: %e %e err = %e\n", c[i].re, c[i].im, err); } } if(lswork == NULL) { lwork = -1; _FT(zgels) ("N", &mp1, &_m, &one, H[0], &mp1, alpha, &mp1, &tmp1, &lwork, &info, 1); lwork = (int)tmp1.re; lswork = (complex*)malloc(lwork*sizeof(complex)); } _FT(zgels) ("N", &mp1, &_m, &one, H[0], &mp1, alpha, &mp1, lswork, &lwork, &info, 1); if(g_proc_id == 0 && g_debug_level > 3) { printf("zgels returned info = %d\n", info); fflush(stdout); } /* Compute the new solution vector */ for(i = 0; i < m; i++){ if(g_proc_id == 0 && g_debug_level > 3) { printf("alpha: %e %e\n", alpha[i].re, alpha[i].im); } assign_add_mul(x0, V[i], alpha[i], N); } } /* If maximal number of restart is reached */ assign(P, x0, N); finalize_solver(solver_field, nr_sf); return(-1); }
int gmres(spinor * const P,spinor * const Q, const int m, const int max_restarts, const double eps_sq, const int rel_prec, const int N, const int parallel, matrix_mult f){ int restart, i, j, k; double beta, eps, norm; complex tmp1, tmp2; spinor ** solver_field = NULL; const int nr_sf = 3; if(N == VOLUME) { init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); } else { init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); } eps=sqrt(eps_sq); init_gmres(m, VOLUMEPLUSRAND); norm = sqrt(square_norm(Q, N, parallel)); assign(solver_field[2], P, N); for(restart = 0; restart < max_restarts; restart++){ /* r_0=Q-AP (b=Q, x+0=P) */ f(solver_field[0], solver_field[2]); diff(solver_field[0], Q, solver_field[0], N); /* v_0=r_0/||r_0|| */ alpha[0].re=sqrt(square_norm(solver_field[0], N, parallel)); if(g_proc_id == g_stdio_proc && g_debug_level > 1){ printf("%d\t%g true residue\n", restart*m, alpha[0].re*alpha[0].re); fflush(stdout); } if(alpha[0].re==0.){ assign(P, solver_field[2], N); finalize_solver(solver_field, nr_sf); return(restart*m); } mul_r(V[0], 1./alpha[0].re, solver_field[0], N); for(j = 0; j < m; j++){ /* solver_field[0]=A*v_j */ f(solver_field[0], V[j]); /* Set h_ij and omega_j */ /* solver_field[1] <- omega_j */ assign(solver_field[1], solver_field[0], N); for(i = 0; i <= j; i++){ H[i][j] = scalar_prod(V[i], solver_field[1], N, parallel); assign_diff_mul(solver_field[1], V[i], H[i][j], N); } _complex_set(H[j+1][j], sqrt(square_norm(solver_field[1], N, parallel)), 0.); for(i = 0; i < j; i++){ tmp1 = H[i][j]; tmp2 = H[i+1][j]; _mult_real(H[i][j], tmp2, s[i]); _add_assign_complex_conj(H[i][j], c[i], tmp1); _mult_real(H[i+1][j], tmp1, s[i]); _diff_assign_complex(H[i+1][j], c[i], tmp2); } /* Set beta, s, c, alpha[j],[j+1] */ beta = sqrt(_complex_square_norm(H[j][j]) + _complex_square_norm(H[j+1][j])); s[j] = H[j+1][j].re / beta; _mult_real(c[j], H[j][j], 1./beta); _complex_set(H[j][j], beta, 0.); _mult_real(alpha[j+1], alpha[j], s[j]); tmp1 = alpha[j]; _mult_assign_complex_conj(alpha[j], c[j], tmp1); /* precision reached? */ if(g_proc_id == g_stdio_proc && g_debug_level > 1){ printf("%d\t%g residue\n", restart*m+j, alpha[j+1].re*alpha[j+1].re); fflush(stdout); } if(((alpha[j+1].re <= eps) && (rel_prec == 0)) || ((alpha[j+1].re <= eps*norm) && (rel_prec == 1))){ _mult_real(alpha[j], alpha[j], 1./H[j][j].re); assign_add_mul(solver_field[2], V[j], alpha[j], N); for(i = j-1; i >= 0; i--){ for(k = i+1; k <= j; k++){ _mult_assign_complex(tmp1, H[i][k], alpha[k]); _diff_complex(alpha[i], tmp1); } _mult_real(alpha[i], alpha[i], 1./H[i][i].re); assign_add_mul(solver_field[2], V[i], alpha[i], N); } for(i = 0; i < m; i++){ alpha[i].im = 0.; } assign(P, solver_field[2], N); finalize_solver(solver_field, nr_sf); return(restart*m+j); } /* if not */ else{ if(j != m-1){ mul_r(V[(j+1)], 1./H[j+1][j].re, solver_field[1], N); } } } j=m-1; /* prepare for restart */ _mult_real(alpha[j], alpha[j], 1./H[j][j].re); assign_add_mul(solver_field[2], V[j], alpha[j], N); for(i = j-1; i >= 0; i--){ for(k = i+1; k <= j; k++){ _mult_assign_complex(tmp1, H[i][k], alpha[k]); _diff_complex(alpha[i], tmp1); } _mult_real(alpha[i], alpha[i], 1./H[i][i].re); assign_add_mul(solver_field[2], V[i], alpha[i], N); } for(i = 0; i < m; i++){ alpha[i].im = 0.; } } /* If maximal number of restarts is reached */ assign(P, solver_field[2], N); finalize_solver(solver_field, nr_sf); return(-1); }