int mr(spinor * const P, spinor * const Q,
       const int max_iter, const double eps_sq,
       const int rel_prec, const int N, const int parallel,
       matrix_mult f) {
  int i = 0;
  double norm_r, beta;
  _Complex double alpha;
  spinor * r;
  spinor ** solver_field = NULL;
  const int nr_sf = 3;

  if(N == VOLUME) {
    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
  }
  else {
    init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
  }
  r = solver_field[0];

  zero_spinor_field(P, N);
  f(solver_field[2], P);
  diff(r, Q, solver_field[2], N);
  norm_r = square_norm(solver_field[0], N, parallel);
  if(g_proc_id == g_stdio_proc && g_debug_level > 2) {
    printf("MR iteration number: %d, |res|^2 = %e\n", i, norm_r);
    fflush(stdout);
  }
  while((norm_r > eps_sq) && (i < max_iter)) {
    i++;
    f(solver_field[1], r);
    alpha = scalar_prod(solver_field[1], r, N, parallel);
    beta = square_norm(solver_field[1], N, parallel);
    alpha /= beta;
    assign_add_mul(P, r, alpha, N);
    if(i % 50 == 0) {
      f(solver_field[2], P);
    }
    else {
      assign_add_mul(solver_field[2], solver_field[1], alpha, N);
    }
    diff(r, Q, solver_field[2], N);
    norm_r = square_norm(solver_field[0], N, parallel);
    if(g_proc_id == g_stdio_proc && g_debug_level > 2) {
      printf("# MR iteration= %d |res|^2= %g\n", i, norm_r);
      fflush(stdout);
    }
  }
  finalize_solver(solver_field, nr_sf);
  if(norm_r > eps_sq) {
    return(-1);
  }
  return(i);
}
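/* ------------------------------------------------------------------------ */
/* Illustration (not tmLQCD code): the minimal residual step above uses      */
/* alpha = (A r, r)/(A r, A r), adds alpha*r to the solution and recomputes  */
/* the true residual every 50 iterations.  The standalone toy below runs the */
/* same recurrence on a small, invented complex matrix with plain arrays;    */
/* apply_A and DIM are made-up names and not part of the tmLQCD API.         */
/* ------------------------------------------------------------------------ */
#include <complex.h>
#include <stdio.h>

#define DIM 4   /* invented toy size */

/* stand-in for the matrix_mult callback f(out, in) */
static void apply_A(double complex *out, const double complex *in) {
  /* a simple diagonally dominant test matrix */
  for (int i = 0; i < DIM; i++)
    out[i] = (4.0 + i) * in[i] + 0.5 * in[(i + 1) % DIM];
}

int main(void) {
  double complex x[DIM] = {0}, r[DIM], Ar[DIM];
  double complex b[DIM] = {1, 2, 3, 4};
  /* r = b - A x with x = 0, as in mr() after zero_spinor_field() */
  for (int i = 0; i < DIM; i++) r[i] = b[i];

  for (int it = 0; it < 100; it++) {
    apply_A(Ar, r);
    double complex num = 0; double den = 0;
    for (int i = 0; i < DIM; i++) {
      num += conj(Ar[i]) * r[i];          /* (A r, r)   */
      den += creal(conj(Ar[i]) * Ar[i]);  /* ||A r||^2  */
    }
    double complex alpha = num / den;
    double nrm = 0.;
    for (int i = 0; i < DIM; i++) {
      x[i] += alpha * r[i];               /* x <- x + alpha r   */
      r[i] -= alpha * Ar[i];              /* r <- r - alpha A r */
      nrm += creal(conj(r[i]) * r[i]);
    }
    if (nrm < 1e-24) { printf("converged after %d iterations\n", it + 1); break; }
  }
  return 0;
}
/* ------------------------------------------------------------------------ */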
void Msap(spinor * const P, spinor * const Q, const int Ncy, const int Niter) {
  int blk, ncy = 0, eo, vol;
  spinor * r, * a, * b;
  double nrm;
  spinor ** solver_field = NULL;
  const int nr_sf = 6;

  /* it would probably be better to get the working fields as a parameter
     from the calling function */
  init_solver_field(&solver_field, VOLUME, nr_sf);
  r = solver_field[0];
  a = solver_field[1];
  b = solver_field[2];

  for(ncy = 0; ncy < Ncy; ncy++) {
    /* compute the global residue; this can be done more efficiently,
       here only a naive implementation is used */
    for(eo = 0; eo < 2; eo++) {
      D_psi(r, P);
      diff(r, Q, r, VOLUME);
      nrm = square_norm(r, VOLUME, 1);
      if(g_proc_id == 0 && g_debug_level > 2 && eo == 1) {
        printf("Msap: %d %1.3e\n", ncy, nrm);
        fflush(stdout);
      }
      /* loop over the blocks of the current colour (even or odd) */
      for (blk = 0; blk < nb_blocks; blk++) {
        if(block_list[blk].evenodd == eo) {
          vol = block_list[blk].volume;
          /* get the part of r corresponding to block blk into b */
          copy_global_to_block(b, r, blk);
          /* solver_field[3] (and the two fields following it) serve as
             work space for mrblk, which needs three fields */
          mrblk(a, b, solver_field[3], Niter, 1.e-31, 1, vol, &dummy_Di, blk);
          /* add the block correction a to the full spinor P */
          add_block_to_global(P, a, blk);
        }
      }
    }
  }
  finalize_solver(solver_field, nr_sf);
  return;
}
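/* ------------------------------------------------------------------------ */
/* Illustration (not tmLQCD code): one Msap() cycle computes the global      */
/* residual, solves the block systems on all blocks of one colour, adds the  */
/* corrections and repeats for the other colour.  The sketch below does the  */
/* same for a 1-d Laplacian, with a few plain MR steps standing in for       */
/* mrblk(); N, NB, BS, apply_A and apply_A_block are invented for this toy.  */
/* ------------------------------------------------------------------------ */
#include <stdio.h>

#define N   16
#define NB  4          /* number of blocks */
#define BS  (N / NB)   /* block size       */

/* y = A x for the 1-d Laplacian with Dirichlet boundaries */
static void apply_A(double *y, const double *x) {
  for (int i = 0; i < N; i++)
    y[i] = 2.0 * x[i] - (i > 0 ? x[i - 1] : 0.0) - (i < N - 1 ? x[i + 1] : 0.0);
}

/* restriction of A to a single block; couplings across the block
   boundary are dropped, as in the block Dirac operator */
static void apply_A_block(double *y, const double *x) {
  for (int i = 0; i < BS; i++)
    y[i] = 2.0 * x[i] - (i > 0 ? x[i - 1] : 0.0) - (i < BS - 1 ? x[i + 1] : 0.0);
}

int main(void) {
  double x[N] = {0}, b[N], r[N], t[N];
  for (int i = 0; i < N; i++) b[i] = 1.0;

  for (int ncy = 0; ncy < 10; ncy++) {          /* SAP cycles (Ncy)      */
    for (int eo = 0; eo < 2; eo++) {            /* even blocks, then odd */
      apply_A(t, x);
      double nrm = 0.0;
      for (int i = 0; i < N; i++) { r[i] = b[i] - t[i]; nrm += r[i] * r[i]; }
      printf("cycle %d colour %d |res|^2 = %e\n", ncy, eo, nrm);

      for (int blk = eo; blk < NB; blk += 2) {  /* blocks of this colour */
        double e[BS] = {0}, rb[BS], Ar[BS];
        for (int i = 0; i < BS; i++) rb[i] = r[blk * BS + i];
        for (int k = 0; k < 5; k++) {           /* inexact block solve (cf. mrblk) */
          apply_A_block(Ar, rb);
          double num = 0.0, den = 0.0;
          for (int i = 0; i < BS; i++) { num += Ar[i] * rb[i]; den += Ar[i] * Ar[i]; }
          if (den == 0.0) break;                /* block residual already zero */
          double alpha = num / den;
          for (int i = 0; i < BS; i++) { e[i] += alpha * rb[i]; rb[i] -= alpha * Ar[i]; }
        }
        for (int i = 0; i < BS; i++) x[blk * BS + i] += e[i];  /* add correction */
      }
    }
  }
  return 0;
}
/* ------------------------------------------------------------------------ */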
void CGeoSmoother(spinor * const P, spinor * const Q, const int Ncy, const int dummy) {
  spinor ** solver_field = NULL;
  const int nr_sf = 5;
  double musave = g_mu;
  g_mu = g_mu1;
  init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);

  convert_lexic_to_eo(solver_field[0], solver_field[1], Q);
  if(g_c_sw > 0)
    assign_mul_one_sw_pm_imu_inv(EE, solver_field[2], solver_field[0], g_mu);
  else
    assign_mul_one_pm_imu_inv(solver_field[2], solver_field[0], +1., VOLUME/2);
  Hopping_Matrix(OE, solver_field[4], solver_field[2]);
  /* The sign is plus, since in Hopping_Matrix the minus is missing */
  assign_mul_add_r(solver_field[4], +1., solver_field[1], VOLUME/2);

  /* Do the inversion with the preconditioned matrix to get the odd sites */
  gamma5(solver_field[4], solver_field[4], VOLUME/2);
  if(g_c_sw > 0) {
    cg_her(solver_field[3], solver_field[4], Ncy, 1.e-8, 1, VOLUME/2, &Qsw_pm_psi);
    Qsw_minus_psi(solver_field[3], solver_field[3]);

    /* Reconstruct the even sites */
    Hopping_Matrix(EO, solver_field[2], solver_field[3]);
    assign_mul_one_sw_pm_imu_inv(EE, solver_field[4], solver_field[2], g_mu);
  }
  else {
    cg_her(solver_field[3], solver_field[4], Ncy, 1.e-8, 1, VOLUME/2, &Qtm_pm_psi);
    Qtm_minus_psi(solver_field[3], solver_field[3]);

    /* Reconstruct the even sites */
    Hopping_Matrix(EO, solver_field[4], solver_field[3]);
    mul_one_pm_imu_inv(solver_field[4], +1., VOLUME/2);
  }
  /* The sign is plus, since in Hopping_Matrix the minus is missing */
  assign_add_mul_r(solver_field[2], solver_field[4], +1., VOLUME/2);
  convert_eo_to_lexic(P, solver_field[2], solver_field[3]);

  g_mu = musave;
  finalize_solver(solver_field, nr_sf);
  return;
}
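/* ------------------------------------------------------------------------ */
/* Illustration (not tmLQCD code): CGeoSmoother() solves the even/odd        */
/* reduced (Schur complement) system on the odd sites and then reconstructs  */
/* the even sites.  The scalar toy below shows that reduction with every     */
/* block of the Dirac operator replaced by a single, arbitrary number; the   */
/* sign conventions of the code (the missing minus in Hopping_Matrix, the    */
/* gamma5 and the squared operator passed to cg_her) are left out.           */
/* ------------------------------------------------------------------------ */
#include <stdio.h>

int main(void) {
  /* invented scalar stand-ins for the blocks [[M_ee, M_eo], [M_oe, M_oo]] */
  double Mee = 4.0, Meo = -1.0, Moe = -1.0, Moo = 3.0;
  double be = 1.0, bo = 2.0;                /* source, even and odd part   */

  /* prepared odd source:  bo' = bo - Moe * Mee^{-1} * be                  */
  double bo_prec = bo - Moe * (be / Mee);

  /* Schur complement on the odd sites:  S = Moo - Moe * Mee^{-1} * Meo    */
  double S = Moo - Moe * (1.0 / Mee) * Meo;
  double xo = bo_prec / S;                  /* done by cg_her in the code  */

  /* reconstruction of the even sites:  xe = Mee^{-1} * (be - Meo * xo)    */
  double xe = (be - Meo * xo) / Mee;

  /* check against the full 2x2 system */
  printf("residual even: %e odd: %e\n",
         Mee * xe + Meo * xo - be, Moe * xe + Moo * xo - bo);
  return 0;
}
/* ------------------------------------------------------------------------ */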
/* P output = solution , Q input = source */ int cg_mms_tm(spinor ** const P, spinor * const Q, solver_params_t * solver_params, double * cgmms_reached_prec) { static double normsq, pro, err, squarenorm; int iteration, N = solver_params->sdim, no_shifts = solver_params->no_shifts; static double gamma, alpham1; spinor ** solver_field = NULL; double atime, etime; const int nr_sf = 3; atime = gettime(); if(solver_params->sdim == VOLUME) { init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); init_mms_tm(no_shifts, VOLUMEPLUSRAND); } else { init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); init_mms_tm(no_shifts, VOLUMEPLUSRAND/2); } zero_spinor_field(P[0], N); alphas[0] = 1.0; betas[0] = 0.0; sigma[0] = solver_params->shifts[0]*solver_params->shifts[0]; if(g_proc_id == 0 && g_debug_level > 1) printf("# CGMMS: shift %d is %e\n", 0, sigma[0]); for(int im = 1; im < no_shifts; im++) { sigma[im] = solver_params->shifts[im]*solver_params->shifts[im] - sigma[0]; if(g_proc_id == 0 && g_debug_level > 1) printf("# CGMMS: shift %d is %e\n", im, sigma[im]); // these will be the result spinor fields zero_spinor_field(P[im], N); // these are intermediate fields assign(ps_mms_solver[im-1], Q, N); zitam1[im] = 1.0; zita[im] = 1.0; alphas[im] = 1.0; betas[im] = 0.0; } /* currently only implemented for P=0 */ squarenorm = square_norm(Q, N, 1); /* if a starting solution vector equal to zero is chosen */ assign(solver_field[0], Q, N); assign(solver_field[1], Q, N); normsq = squarenorm; /* main loop */ for(iteration = 0; iteration < solver_params->max_iter; iteration++) { /* Q^2*p and then (p,Q^2*p) */ solver_params->M_psi(solver_field[2], solver_field[1]); // add the zero's shift assign_add_mul_r(solver_field[2], solver_field[1], sigma[0], N); pro = scalar_prod_r(solver_field[1], solver_field[2], N, 1); /* For the update of the coeff. of the shifted pol. we need alphas[0](i-1) and alpha_cg(i). 
This is the reason why we need this double definition of alpha */ alpham1 = alphas[0]; /* Compute alphas[0](i+1) */ alphas[0] = normsq/pro; for(int im = 1; im < no_shifts; im++) { /* Now gamma is a temp variable that corresponds to zita(i+1) */ gamma = zita[im]*alpham1/(alphas[0]*betas[0]*(1.-zita[im]/zitam1[im]) + alpham1*(1.+sigma[im]*alphas[0])); // Now zita(i-1) is put equal to the old zita(i) zitam1[im] = zita[im]; // Now zita(i+1) is updated zita[im] = gamma; // Update of alphas(i) = alphas[0](i)*zita(i+1)/zita(i) alphas[im] = alphas[0]*zita[im]/zitam1[im]; // Compute xs(i+1) = xs(i) + alphas(i)*ps(i) assign_add_mul_r(P[im], ps_mms_solver[im-1], alphas[im], N); // in the CG the corrections are decreasing with the iteration number increasing // therefore, we can remove shifts when the norm of the correction vector // falls below a threshold // this is useful for computing time and needed, because otherwise // zita might get smaller than DOUBLE_EPS and, hence, zero if(iteration > 0 && (iteration % 20 == 0) && (im == no_shifts-1)) { double sn = square_norm(ps_mms_solver[im-1], N, 1); if(alphas[no_shifts-1]*alphas[no_shifts-1]*sn <= solver_params->squared_solver_prec) { no_shifts--; if(g_debug_level > 2 && g_proc_id == 0) { printf("# CGMMS: at iteration %d removed one shift, %d remaining\n", iteration, no_shifts); } } } } /* Compute x_(i+1) = x_i + alphas[0](i+1) p_i */ assign_add_mul_r(P[0], solver_field[1], alphas[0], N); /* Compute r_(i+1) = r_i - alphas[0](i+1) Qp_i */ assign_add_mul_r(solver_field[0], solver_field[2], -alphas[0], N); /* Check whether the precision eps_sq is reached */ err = square_norm(solver_field[0], N, 1); if(g_debug_level > 2 && g_proc_id == g_stdio_proc) { printf("# CGMMS iteration: %d residue: %g\n", iteration, err); fflush( stdout ); } if( ((err <= solver_params->squared_solver_prec) && (solver_params->rel_prec == 0)) || ((err <= solver_params->squared_solver_prec*squarenorm) && (solver_params->rel_prec > 0)) || (iteration == solver_params->max_iter -1) ) { /* FIXME temporary output of precision until a better solution can be found */ *cgmms_reached_prec = err; break; } /* Compute betas[0](i+1) = (r(i+1),r(i+1))/(r(i),r(i)) Compute p(i+1) = r(i+1) + beta(i+1)*p(i) */ betas[0] = err/normsq; assign_mul_add_r(solver_field[1], betas[0], solver_field[0], N); normsq = err; /* Compute betas(i+1) = betas[0](i+1)*(zita(i+1)*alphas(i))/(zita(i)*alphas[0](i)) Compute ps(i+1) = zita(i+1)*r(i+1) + betas(i+1)*ps(i) */ for(int im = 1; im < no_shifts; im++) { betas[im] = betas[0]*zita[im]*alphas[im]/(zitam1[im]*alphas[0]); assign_mul_add_mul_r(ps_mms_solver[im-1], solver_field[0], betas[im], zita[im], N); } } etime = gettime(); g_sloppy_precision = 0; if(iteration == solver_params->max_iter -1) iteration = -1; else iteration++; if(g_debug_level > 0 && g_proc_id == 0) { printf("# CGMMS (%d shifts): iter: %d eps_sq: %1.4e %1.4e t/s\n", solver_params->no_shifts, iteration, solver_params->squared_solver_prec, etime - atime); } finalize_solver(solver_field, nr_sf); return(iteration); }
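/* ------------------------------------------------------------------------ */
/* Illustration (not tmLQCD code): a minimal multi-shift CG in the same      */
/* normalisation as cg_mms_tm(), with the zero'th shift folded into the      */
/* operator and the remaining systems obtained through the zita recurrence.  */
/* A is an invented SPD diagonal matrix, the shift values are arbitrary,     */
/* and the shift-removal and relative-precision logic above is omitted.      */
/* ------------------------------------------------------------------------ */
#include <stdio.h>

#define DIM 8
#define NS  3                      /* number of shifts (invented) */

static const double diag_A[DIM] = {1, 2, 3, 4, 5, 6, 7, 8};
static const double shift[NS]   = {0.1, 0.5, 2.0};

int main(void) {
  double b[DIM], r[DIM], p[DIM], q[DIM];
  double x[NS][DIM] = {{0}}, ps[NS][DIM];
  double zita[NS], zitam1[NS], alpha[NS], beta[NS], sigma[NS];

  for (int i = 0; i < DIM; i++) { b[i] = 1.0; r[i] = b[i]; p[i] = b[i]; }
  sigma[0] = shift[0];
  for (int k = 1; k < NS; k++) sigma[k] = shift[k] - shift[0];
  for (int k = 0; k < NS; k++) {
    zita[k] = zitam1[k] = 1.0; alpha[k] = 1.0; beta[k] = 0.0;
    for (int i = 0; i < DIM; i++) ps[k][i] = b[i];
  }
  double normsq = DIM;             /* ||b||^2 */

  for (int it = 0; it < 200; it++) {
    double pro = 0.0;
    for (int i = 0; i < DIM; i++) {          /* q = (A + sigma_0) p */
      q[i] = (diag_A[i] + sigma[0]) * p[i];
      pro += p[i] * q[i];
    }
    double alpham1 = alpha[0];
    alpha[0] = normsq / pro;
    for (int k = 1; k < NS; k++) {           /* shifted coefficients */
      double gamma = zita[k] * alpham1 /
        (alpha[0] * beta[0] * (1. - zita[k] / zitam1[k])
         + alpham1 * (1. + sigma[k] * alpha[0]));
      zitam1[k] = zita[k];
      zita[k] = gamma;
      alpha[k] = alpha[0] * zita[k] / zitam1[k];
      for (int i = 0; i < DIM; i++) x[k][i] += alpha[k] * ps[k][i];
    }
    double err = 0.0;
    for (int i = 0; i < DIM; i++) {
      x[0][i] += alpha[0] * p[i];
      r[i]    -= alpha[0] * q[i];
      err     += r[i] * r[i];
    }
    if (err < 1e-24) break;
    beta[0] = err / normsq;
    normsq = err;
    for (int i = 0; i < DIM; i++) p[i] = r[i] + beta[0] * p[i];
    for (int k = 1; k < NS; k++) {
      beta[k] = beta[0] * zita[k] * alpha[k] / (zitam1[k] * alpha[0]);
      for (int i = 0; i < DIM; i++) ps[k][i] = zita[k] * r[i] + beta[k] * ps[k][i];
    }
  }
  for (int k = 0; k < NS; k++) {             /* check (A + shift_k) x_k = b */
    double res = 0.0;
    for (int i = 0; i < DIM; i++) {
      double d = (diag_A[i] + shift[k]) * x[k][i] - b[i];
      res += d * d;
    }
    printf("shift %d squared residual %e\n", k, res);
  }
  return 0;
}
/* ------------------------------------------------------------------------ */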
int incr_eigcg(const int N, const int nrhs, const int nrhs1, spinor * const x, spinor * const b, const int ldh, matrix_mult f, const double eps_sq1, const double eps_sq, double restart_eps_sq, const int rand_guess_opt, const int rel_prec, const int maxit, int nev, const int v_max) { /*Static variables and arrays.*/ static spinor **solver_field; /*4 spinor fields*/ static int ncurEvals=0; /* current number of stored eigenvectors */ static int ncurRHS=0; /* current number of the system being solved */ static spinor **evecs; /* accumulated eigenvectors for deflation. */ static void *_evals; static double *evals; /* Ritz values */ static void *_v; static spinor *V; /* work array for eigenvector search basis in eigCG */ static void *_h; static _Complex double *H; /* The ncurEvals^2 matrix: H=evecs'*A*evecs */ static void *_hu; static _Complex double *HU; /* used for diagonalization of H if eigenvalues requested also used as a copy of H if needed*/ static void *_initwork; static _Complex double *initwork; /* vector of size ldh using with init-CG */ static void *_ework; static _Complex double *ework; /* end of the thinking part */ static void *_work; static _Complex double *work; static void *_rwork; static double *rwork; static void *_IPIV; static int *IPIV; /*integer array to store permutations when solving the small linear system*/ /* some constants */ char cU='U'; char cN='N'; char cV='V'; _Complex double tpone= 1.0e+00; _Complex double tzero= 0.0e+00; //tpone.re=+1.0e+00; tpone.im=0.0e+00; //tzero.re=+0.0e+00; tzero.im=0.0e+00; /* Timing vars */ double wt1,wt2,wE,wI; double eps_sq_used; /* Variables */ double machEps = 1e-15; double normb, normsq, tmpd,tmpd2; _Complex double tempz; int i,j, ONE = 1; int tmpsize,tmpi,info=0; int numIts, flag, nAdded, nev_used; int maxit_remain; int esize,nrsf; int parallel; /* for parallel processing of the scalar products */ /* leading dimension for spinor vectors */ int LDN; if(N==VOLUME) LDN = VOLUMEPLUSRAND; else LDN = VOLUMEPLUSRAND/2; #ifdef MPI parallel=1; #else parallel=0; #endif /*think more about this */ esize=2*12*N+4*nev*nev; /* fixed size for ework used for restarting in eigcg*/ nrsf=4; /*number of solver fields */ int lwork=3*ldh; double cur_res; //current residual squared (initial value will be computed in eigcg) /*increment the RHS counter*/ ncurRHS = ncurRHS +1; //set the tolerance to be used for this right-hand side if(ncurRHS > nrhs1){ eps_sq_used = eps_sq; } else{ eps_sq_used = eps_sq1; } if(ncurRHS==1)/* If this is the first system, allocate needed memory for the solver*/ { init_solver_field(&solver_field, LDN, nrsf); } if(nev==0){ /*incremental eigcg is used as a cg solver. 
No need to restart forcing no-restart*/ if(g_proc_id == g_stdio_proc && g_debug_level > 0) { fprintf(stdout, "CG won't be restarted in this mode since no deflation will take place (nev=0)\n"); fflush(stdout); } restart_eps_sq=0.0; } if((ncurRHS==1) && (nev >0) )/* If this is the first right-hand side and eigenvectors are needed, allocate needed memory*/ { init_solver_field(&evecs, LDN, ldh); #if (defined SSE || defined SSE2 || defined SSE3) /*Extra elements are needed for allignment */ //_v = malloc(LDN*v_max*sizeof(spinor)+ALIGN_BASE); _v = calloc(LDN*v_max+ALIGN_BASE,sizeof(spinor)); V = (spinor *)(((unsigned long int)(_v)+ALIGN_BASE)&~ALIGN_BASE); //_h=malloc(ldh*ldh*sizeof(_Complex double )+ALIGN_BASE); _h=calloc(ldh*ldh+ALIGN_BASE,sizeof(_Complex double )); H = (_Complex double *)(((unsigned long int)(_h)+ALIGN_BASE)&~ALIGN_BASE); //_hu=malloc(ldh*ldh*sizeof(_Complex double )+ALIGN_BASE); _hu=calloc(ldh*ldh+ALIGN_BASE,sizeof(_Complex double )); HU = (_Complex double *)(((unsigned long int)(_hu)+ALIGN_BASE)&~ALIGN_BASE); //_ework = malloc(esize*sizeof(_Complex double )+ALIGN_BASE); _ework = calloc(esize+ALIGN_BASE,sizeof(_Complex double )); ework=(_Complex double *)(((unsigned long int)(_ework)+ALIGN_BASE)&~ALIGN_BASE); //_initwork = malloc(ldh*sizeof(_Complex double )+ALIGN_BASE); _initwork = calloc(ldh+ALIGN_BASE,sizeof(_Complex double )); initwork = (_Complex double *)(((unsigned long int)(_initwork)+ALIGN_BASE)&~ALIGN_BASE); //_work = malloc(lwork*sizeof(_Complex double )+ALIGN_BASE); _work = calloc(lwork+ALIGN_BASE,sizeof(_Complex double )); work = (_Complex double *)(((unsigned long int)(_work)+ALIGN_BASE)&~ALIGN_BASE); //_rwork = malloc(3*ldh*sizeof(double)+ALIGN_BASE); _rwork = calloc(3*ldh+ALIGN_BASE,sizeof(double)); rwork = (double *)(((unsigned long int)(_rwork)+ALIGN_BASE)&~ALIGN_BASE); //_IPIV = malloc(ldh*sizeof(int)+ALIGN_BASE); _IPIV = calloc(ldh+ALIGN_BASE,sizeof(int)); IPIV = (int *)(((unsigned long int)(_IPIV)+ALIGN_BASE)&~ALIGN_BASE); //_evals = malloc(ldh*sizeof(double)+ALIGN_BASE); _evals = calloc(ldh+ALIGN_BASE,sizeof(double)); evals = (double *)(((unsigned long int)(_evals)+ALIGN_BASE)&~ALIGN_BASE); #else V = (spinor *) calloc(LDN*v_max,sizeof(spinor)); H = calloc(ldh*ldh, sizeof(_Complex double )); HU= calloc(ldh*ldh, sizeof(_Complex double )); initwork = calloc(ldh, sizeof(_Complex double )); ework = calloc(esize, sizeof(_Complex double )); work = calloc(lwork,sizeof(_Complex double )); rwork= calloc(3*ldh,sizeof(double)); IPIV = calloc(ldh, sizeof(int)); evals = (double *) calloc(ldh, sizeof(double)); #endif } /*if(ncurRHS==1)*/ if(g_proc_id == g_stdio_proc && g_debug_level > 0) { fprintf(stdout, "System %d, eps_sq %e\n",ncurRHS,eps_sq_used); fflush(stdout); } /*---------------------------------------------------------------*/ /* Call eigCG until this right-hand side converges */ /*---------------------------------------------------------------*/ wE = 0.0; wI = 0.0; /* Start accumulator timers */ flag = -1; /* First time through. 
Run eigCG regularly */ maxit_remain = maxit; /* Initialize Max and current # of iters */ numIts = 0; while( flag == -1 || flag == 3) { //if(g_proc_id==g_stdio_proc) //printf("flag= %d, ncurEvals= %d\n",flag,ncurEvals); if(ncurEvals > 0) { /* --------------------------------------------------------- */ /* Perform init-CG with evecs vectors */ /* xinit = xinit + evecs*Hinv*evec'*(b-Ax0) */ /* --------------------------------------------------------- */ wt1 = gettime(); /*r0=b-Ax0*/ normsq = square_norm(x,N,parallel); if(normsq>0.0) { f(solver_field[0],x); /* solver_field[0]= A*x */ diff(solver_field[1],b,solver_field[0],N); /* solver_filed[1]=b-A*x */ } else assign(solver_field[1],b,N); /* solver_field[1]=b */ /* apply the deflation using init-CG */ /* evecs'*(b-Ax) */ for(i=0; i<ncurEvals; i++) { initwork[i]= scalar_prod(evecs[i],solver_field[1],N,parallel); } /* solve the linear system H y = c */ tmpsize=ldh*ncurEvals; _FT(zcopy) (&tmpsize,H,&ONE,HU,&ONE); /* copy H into HU */ _FT(zgesv) (&ncurEvals,&ONE,HU,&ldh,IPIV,initwork,&ldh,&info); if(info != 0) { if(g_proc_id == g_stdio_proc) { fprintf(stderr, "Error in ZGESV:, info = %d\n",info); fflush(stderr); } exit(1); } /* x = x + evecs*inv(H)*evecs'*r */ for(i=0; i<ncurEvals; i++) { assign_add_mul(x,evecs[i],initwork[i],N); } /* compute elapsed time and add to accumulator */ wt2 = gettime(); wI = wI + wt2-wt1; }/* if(ncurEvals > 0) */ /* ------------------------------------------------------------ */ /* Adjust nev for eigcg according to available ldh/restart */ /* ------------------------------------------------------------ */ if (flag == 3) { /* restart with the same rhs, set nev_used = 0 */ nev_used = 0; /* if convergence seems before next restart do not restart again */ if(rel_prec) { if (cur_res*(restart_eps_sq) < eps_sq*normb*normb) restart_eps_sq=0.0; } else { if (cur_res*(restart_eps_sq) < eps_sq) restart_eps_sq=0.0; } /* if(rel_prec) */ } else { /* First time through this rhs. 
Find nev evecs */ /* limited by the ldh evecs we can store in total */ if (ldh-ncurEvals < nev) nev = ldh - ncurEvals; nev_used = nev; } /* ------------------------------------------------------------ */ /* Solve Ax = b with x initial guess */ /* ------------------------------------------------------------ */ wt1 = gettime(); eigcg( N, LDN, x, b, &normb, eps_sq_used, restart_eps_sq, rel_prec, maxit_remain, &numIts, &cur_res, &flag, solver_field, f, nev_used, v_max, V, esize, ework); //if(g_proc_id == g_stdio_proc) //printf("eigcg flag= %d \n",flag); wt2 = gettime(); wE = wE + wt2-wt1; /* if flag == 3 update the remain max number of iterations */ maxit_remain = maxit - numIts; } /* end while (flag ==-1 || flag == 3) */ /* ------------------------------------------------ */ /* ---------- */ /* Reporting */ /* ---------- */ /* compute the exact residual */ f(solver_field[0],x); /* solver_field[0]= A*x */ diff(solver_field[1],b,solver_field[0],N); /* solver_filed[1]=b-A*x */ normsq=square_norm(solver_field[1],N,parallel); if(g_debug_level > 0 && g_proc_id == g_stdio_proc) { fprintf(stdout, "For this rhs:\n"); fprintf(stdout, "Total initCG Wallclock : %-f\n", wI); fprintf(stdout, "Total eigpcg Wallclock : %-f\n", wE); fprintf(stdout, "Iterations: %-d\n", numIts); fprintf(stdout, "Residual: %e, Actual Resid of LinSys : %e\n", cur_res,normsq); if (flag != 0) { fprintf(stderr, "Error: eigcg returned with nonzero exit status\n"); return flag; fflush(stderr); } fflush(stdout); } /* ------------------------------------------------------------------- */ /* ------------------------------------------------------------------- */ /* Update the evecs and the factorization of evecs'*A*evecs */ /* ------------------------------------------------------------------- */ if (nev > 0) { wt1 = gettime(); /* Append new Ritz vectors to the basis and orthogonalize them to evecs */ for(i=0; i<nev_used; i++) assign(evecs[i+ncurEvals],&V[i*LDN],N); nAdded = ortho_new_vectors(evecs,N,ncurEvals,nev_used,machEps); /* expand H */ for(j=ncurEvals; j< (ncurEvals+nAdded); j++) { f(solver_field[0],evecs[j]); for(i=0; i<=j; i++) { H[i+j*ldh] = scalar_prod(evecs[i],solver_field[0],N,parallel); H[j+i*ldh]= conj(H[i+j*ldh]); //H[j+i*ldh].re = H[i+j*ldh].re; //H[j+i*ldh].im = -H[i+j*ldh].im; } } /* update the number of vectors in the basis */ ncurEvals = ncurEvals + nAdded; /* ---------- */ /* Reporting */ /* ---------- */ wt2 = gettime(); if(g_proc_id == g_stdio_proc && g_debug_level > 0) { fprintf(stdout,"ncurRHS %d\n",ncurRHS); fprintf(stdout,"ncurEvals %d \n",ncurEvals); fprintf(stdout,"Update\n"); fprintf(stdout,"Added %d vecs\n",nAdded); fprintf(stdout,"U Wallclock : %-f\n", wt2-wt1); fprintf(stdout,"Note: Update Wall time doesn't include time for computing eigenvalues and their residuals.\n"); fflush(stdout); } if(g_debug_level > 3) /*compute eigenvalues and their residuals if requested*/ { /* copy H into HU */ tmpsize=ldh*ncurEvals; _FT(zcopy) (&tmpsize,H,&ONE,HU,&ONE); /* compute eigenvalues and eigenvectors of HU (using V and spinor fields as tmp work spaces)*/ _FT(zheev)(&cV, &cU, &ncurEvals, HU, &ldh, evals, work, &lwork, rwork, &info,1,1); if(info != 0) { if(g_proc_id == g_stdio_proc) { fprintf(stderr,"Error in ZHEEV:, info = %d\n",info); fflush(stderr); } exit(1); } /* compute residuals and print out results */ for(i=0; i<ncurEvals; i++) { tmpi=12*N; tmpsize=12*LDN; _FT(zgemv)(&cN,&tmpi,&ncurEvals,&tpone,(_Complex double *)evecs[0],&tmpsize, &HU[i*ldh], &ONE,&tzero,(_Complex double *) solver_field[0],&ONE,1); 
        normsq = square_norm(solver_field[0], N, parallel);
        f(solver_field[1], solver_field[0]);
        tempz = scalar_prod(solver_field[0], solver_field[1], N, parallel);
        evals[i] = creal(tempz)/normsq;
        mul_r(solver_field[2], evals[i], solver_field[0], N);
        diff(solver_field[3], solver_field[1], solver_field[2], N);
        tmpd2 = square_norm(solver_field[3], N, parallel);
        tmpd = sqrt(tmpd2/normsq);
        if(g_proc_id == g_stdio_proc) {
          fprintf(stdout, "RR Eval[%d]: %22.15E rnorm: %22.15E\n", i+1, evals[i], tmpd);
          fflush(stdout);
        }
      }
    } /* if(g_debug_level > 3) */
  } /* if(nev > 0) */

  /*--------------------------------------*/
  /* free memory that is no longer needed */
  /* and reset ncurRHS and ncurEvals      */
  /*--------------------------------------*/
  if(ncurRHS == nrhs) { /* this was the last system to be solved */
    ncurRHS = 0;
    ncurEvals = 0;
    finalize_solver(solver_field, nrsf);
  }

  if((ncurRHS == nrhs) && (nev > 0)) {
    /* this was the last system to be solved and memory was allocated
       for the eigenvector computation */
    finalize_solver(evecs, ldh);
#if (defined SSE || defined SSE2 || defined SSE3)
    free(_v); free(_h); free(_hu); free(_ework); free(_initwork);
    free(_IPIV); free(_evals); free(_rwork); free(_work);
#else
    free(V); free(H); free(HU); free(ework); free(initwork);
    free(IPIV); free(evals); free(rwork); free(work);
#endif
  }

  return numIts;
}
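/* ------------------------------------------------------------------------ */
/* Illustration (not tmLQCD code): for ncurEvals > 0 the function above      */
/* corrects the initial guess by x <- x + evecs H^{-1} evecs^H (b - A x)     */
/* with H = evecs^H A evecs (solved with ZGESV).  The toy below performs     */
/* the same Galerkin projection for an invented real diagonal matrix, where  */
/* H is diagonal and no LAPACK call is needed; it only checks that the       */
/* deflated residual has no component along the stored vectors.              */
/* ------------------------------------------------------------------------ */
#include <stdio.h>

#define DIM 6   /* invented toy sizes */
#define NEV 2

static const double diag_A[DIM] = {0.01, 0.05, 1, 2, 3, 4};

int main(void) {
  /* V = the two unit vectors belonging to the smallest eigenvalues */
  double V[NEV][DIM] = {{1, 0, 0, 0, 0, 0}, {0, 1, 0, 0, 0, 0}};
  double b[DIM], x[DIM] = {0}, r[DIM], c[NEV];

  for (int i = 0; i < DIM; i++) { b[i] = 1.0; r[i] = b[i]; } /* r = b - A*0 */

  /* c = V^T r  (the "initwork" vector in incr_eigcg) */
  for (int k = 0; k < NEV; k++) {
    c[k] = 0.0;
    for (int i = 0; i < DIM; i++) c[k] += V[k][i] * r[i];
  }
  /* solve H y = c; here H = V^T A V is diagonal, so no ZGESV is needed */
  for (int k = 0; k < NEV; k++) {
    double H_kk = 0.0;
    for (int i = 0; i < DIM; i++) H_kk += V[k][i] * diag_A[i] * V[k][i];
    c[k] /= H_kk;
  }
  /* x <- x + V y */
  for (int k = 0; k < NEV; k++)
    for (int i = 0; i < DIM; i++) x[i] += c[k] * V[k][i];

  /* the deflated residual has no component along the stored vectors */
  for (int i = 0; i < DIM; i++) r[i] = b[i] - diag_A[i] * x[i];
  for (int k = 0; k < NEV; k++) {
    double proj = 0.0;
    for (int i = 0; i < DIM; i++) proj += V[k][i] * r[i];
    printf("V[%d]^T r after deflation: %e\n", k, proj);
  }
  return 0;
}
/* ------------------------------------------------------------------------ */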
int gcr(spinor * const P, spinor * const Q, const int m, const int max_restarts, const double eps_sq, const int rel_prec, const int N, const int precon, matrix_mult f) { int k, l, restart, i, iter = 0; double norm_sq, err; spinor * rho, * tmp; complex ctmp; spinor ** solver_field = NULL; const int nr_sf = 2; if(N == VOLUME) { init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); } else { init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); } rho = solver_field[0]; tmp = solver_field[1]; init_gcr(m, N+RAND); norm_sq = square_norm(Q, N, 1); if(norm_sq < 1.e-32) { norm_sq = 1.; } for(restart = 0; restart < max_restarts; restart++) { dfl_sloppy_prec = 0; f(tmp, P); diff(rho, Q, tmp, N); err = square_norm(rho, N, 1); if(g_proc_id == g_stdio_proc && g_debug_level > 2){ printf("GCR: iteration number: %d, true residue: %g\n", iter, err); fflush(stdout); } if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1))) { finalize_solver(solver_field, nr_sf); return(iter); } for(k = 0; k < m; k++) { if(precon == 0) { assign(xi[k], rho, N); } else { zero_spinor_field(xi[k], N); Msap_eo(xi[k], rho, 6); /* Msap(xi[k], rho, 8); */ } dfl_sloppy_prec = 1; dfl_little_D_prec = 1.e-12; f(tmp, xi[k]); /* tmp will become chi[k] */ for(l = 0; l < k; l++) { a[l][k] = scalar_prod(chi[l], tmp, N, 1); assign_diff_mul(tmp, chi[l], a[l][k], N); } b[k] = sqrt(square_norm(tmp, N, 1)); mul_r(chi[k], 1./b[k], tmp, N); c[k] = scalar_prod(chi[k], rho, N, 1); assign_diff_mul(rho, chi[k], c[k], N); err = square_norm(rho, N, 1); iter ++; if(g_proc_id == g_stdio_proc && g_debug_level > 0){ if(rel_prec == 1) printf("# GCR: %d\t%g >= %g iterated residue\n", iter, err, eps_sq*norm_sq); else printf("# GCR: %d\t%g >= %giterated residue\n", iter, err, eps_sq); fflush(stdout); } /* Precision reached? */ if((k == m-1) || ((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1))) { break; } } /* prepare for restart */ _mult_real(c[k], c[k], 1./b[k]); assign_add_mul(P, xi[k], c[k], N); for(l = k-1; l >= 0; l--) { for(i = l+1; i <= k; i++) { _mult_assign_complex(ctmp, a[l][i], c[i]); /* c[l] -= ctmp */ _diff_complex(c[l], ctmp); } _mult_real(c[l], c[l], 1./b[l]); assign_add_mul(P, xi[l], c[l], N); } } finalize_solver(solver_field, nr_sf); return(-1); }
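/* ------------------------------------------------------------------------ */
/* Illustration (not tmLQCD code): the GCR recurrence behind gcr().  Each    */
/* search direction is kept together with chi_k = A xi_k, chi_k is           */
/* orthonormalised against the previous chi_l, and the residual is minimised */
/* along each new direction.  Unlike gcr(), which postpones the solution     */
/* update to the end of a restart cycle and recovers it from a[l][k], b[k]   */
/* and c[k] by back substitution, the toy updates x on the fly and uses no   */
/* preconditioner; DIM, M and apply_A are invented names.                    */
/* ------------------------------------------------------------------------ */
#include <math.h>
#include <stdio.h>

#define DIM 6
#define M   6     /* maximal number of directions in this single cycle */

static void apply_A(double *y, const double *x) {
  for (int i = 0; i < DIM; i++)
    y[i] = (3.0 + i) * x[i] + 0.25 * x[(i + 1) % DIM];
}

static double dot(const double *a, const double *b) {
  double s = 0.0;
  for (int i = 0; i < DIM; i++) s += a[i] * b[i];
  return s;
}

int main(void) {
  double x[DIM] = {0}, b[DIM], r[DIM], t[DIM];
  double p[M][DIM], chi[M][DIM];

  for (int i = 0; i < DIM; i++) { b[i] = 1.0; r[i] = b[i]; }  /* r = b - A*0 */

  for (int k = 0; k < M; k++) {
    for (int i = 0; i < DIM; i++) p[k][i] = r[i];             /* no precon   */
    apply_A(t, p[k]);
    for (int i = 0; i < DIM; i++) chi[k][i] = t[i];
    /* orthogonalise chi_k against the previous chi_l, carrying the same
       combination on p_k so that chi_k = A p_k stays true */
    for (int l = 0; l < k; l++) {
      double a_lk = dot(chi[l], chi[k]);
      for (int i = 0; i < DIM; i++) {
        chi[k][i] -= a_lk * chi[l][i];
        p[k][i]   -= a_lk * p[l][i];
      }
    }
    double bk = sqrt(dot(chi[k], chi[k]));
    for (int i = 0; i < DIM; i++) { chi[k][i] /= bk; p[k][i] /= bk; }
    /* minimise the residual along the new direction */
    double ck = dot(chi[k], r);
    for (int i = 0; i < DIM; i++) {
      x[i] += ck * p[k][i];
      r[i] -= ck * chi[k][i];
    }
    printf("GCR step %d |res|^2 = %e\n", k + 1, dot(r, r));
    if (dot(r, r) < 1e-24) break;
  }
  return 0;
}
/* ------------------------------------------------------------------------ */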
void Msap_eo(spinor * const P, spinor * const Q, const int Ncy, const int Niter) { int ncy = 0, vol, vols; spinor * r, * a, * b; double nrm; double musave = g_mu; double kappasave = g_kappa; spinor ** solver_field = NULL; // also get space for mrblk! 6 = 3+3 const int nr_sf = 6; if(kappa_Msap > 0) { g_kappa = kappa_Msap; } if(mu_Msap > -10) { g_mu = mu_Msap; // make sure the sign is correct! if(g_mu*musave < 0) g_mu *= -1.; } boundary(g_kappa); /* * here it would be probably better to get the working fields as a parameter * from the calling function */ vols = block_list[0].volume/2+block_list[0].spinpad; vol = block_list[0].volume/2; init_solver_field(&solver_field, nb_blocks*2*vols, nr_sf); r = solver_field[0]; a = solver_field[1]; b = solver_field[2]; int * blk_e_list = malloc(nb_blocks/2*sizeof(int)); int * blk_o_list = malloc(nb_blocks/2*sizeof(int)); int iblke = 0, iblko = 0; for(int blk = 0; blk < nb_blocks; blk++) { if (block_list[blk].evenodd == 0) { blk_e_list[iblke] = blk; iblke++; } else { blk_o_list[iblko] = blk; iblko++; } } for(ncy = 0; ncy < Ncy; ncy++) { /* compute the global residue */ /* this can be done more efficiently */ /* here only a naive implementation */ for(int eo = 0; eo < 2; eo++) { D_psi(r, P); diff(r, Q, r, VOLUME); nrm = square_norm(r, VOLUME, 1); if(g_proc_id == 0 && g_debug_level > 2 && eo == 0) { printf("Msap_eo: %d %1.3e mu = %e\n", ncy, nrm, g_mu/2./g_kappa); fflush(stdout); } int * blk_eo_list; if(eo == 0) { blk_eo_list = blk_e_list; } else { blk_eo_list = blk_o_list; } /* choose the even (odd) block */ // rely on nested parallelism // #ifdef TM_USE_OMP # pragma omp parallel for #endif for (int iblk = 0; iblk < nb_blocks/2; iblk++) { int blk = blk_eo_list[iblk]; spinor32 * b_even = (spinor32*) (b + blk*2*vols); spinor32 * b_odd = (spinor32*) (b +blk*2*vols + vols); spinor32 * a_even = (spinor32*) (a + blk*2*vols); spinor32 * a_odd = (spinor32*) (a + blk*2*vols + vols); // mrblk needs 3 solver fields which we distribute according to the block number spinor32 * c = (spinor32*) (solver_field[3] + blk*2*3*vols); /* get part of r corresponding to block blk into b_even and b_odd */ copy_global_to_block_eo_32(b_even, b_odd, r, blk); if(g_c_sw > 0) { assign_mul_one_sw_pm_imu_inv_block_32(EE, a_even, b_even, g_mu, &block_list[blk]); Block_H_psi_32(&block_list[blk], a_odd, a_even, OE); /* a_odd = b_odd - a_odd */ diff_32(a_odd, b_odd, a_odd, vol); mrblk_32(b_odd, a_odd, c, Niter, 1.e-31, 1, vol, &Msw_plus_block_psi_32, blk); Block_H_psi_32(&block_list[blk], b_even, b_odd, EO); assign_32(c, b_even, vol); assign_mul_one_sw_pm_imu_inv_block_32(EE, b_even, c, g_mu, &block_list[blk]); } else { assign_mul_one_pm_imu_inv_32(a_even, b_even, +1., vol); Block_H_psi_32(&block_list[blk], a_odd, a_even, OE); /* a_odd = b_odd - a_odd */ diff_32(a_odd, b_odd, a_odd, vol); mrblk_32(b_odd, a_odd, c, Niter, 1.e-31, 1, vol, &Mtm_plus_block_psi_32, blk); Block_H_psi_32(&block_list[blk], b_even, b_odd, EO); mul_one_pm_imu_inv_32(b_even, +1., vol); } /* a_even = a_even - b_even */ diff_32(a_even, a_even, b_even, vol); /* add even and odd part up to full spinor P */ add_eo_block_32_to_global(P, a_even, b_odd, blk); } } } free(blk_e_list); free(blk_o_list); finalize_solver(solver_field, nr_sf); g_mu = musave; g_kappa = kappasave; boundary(g_kappa); return; }
/* P output = solution , Q input = source */ int pcg_her(spinor * const P, spinor * const Q, const int max_iter, double eps_sq, const int rel_prec, const int N, matrix_mult f) { double normsp, pro, pro2, err, alpha_cg, beta_cg, squarenorm; int iteration; spinor ** solver_field = NULL; const int nr_sf = 5; if(N == VOLUME) { init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); } else { init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); } squarenorm = square_norm(Q, N, 1); /* !!!! INITIALIZATION !!!! */ assign(solver_field[0], P, N); /* (r_0,r_0) = normsq */ normsp = square_norm(P, N, 1); assign(solver_field[3], Q, N); /* initialize residue r and search vector p */ if(normsp==0){ /* if a starting solution vector equal to zero is chosen */ /* r0 */ assign(solver_field[1], solver_field[3], N); /* p0 */ } else{ /* if a starting solution vector different from zero is chosen */ /* r0 = b - A x0 */ f(solver_field[2], solver_field[0]); diff(solver_field[1], solver_field[3], solver_field[2], N); } /* z0 = M^-1 r0 */ invert_eigenvalue_part(solver_field[3], solver_field[1], 10, N); /* p0 = z0 */ assign(solver_field[2], solver_field[3], N); /* Is this really real? */ pro2 = scalar_prod_r(solver_field[1], solver_field[3], N, 1); /* main loop */ for(iteration = 0; iteration < max_iter; iteration++) { /* A p */ f(solver_field[4], solver_field[2]); pro = scalar_prod_r(solver_field[2], solver_field[4], N, 1); /* Compute alpha_cg(i+1) */ alpha_cg=pro2/pro; /* Compute x_(i+1) = x_i + alpha_cg(i+1) p_i */ assign_add_mul_r(solver_field[0], solver_field[2], alpha_cg, N); /* Compute r_(i+1) = r_i - alpha_cg(i+1) Qp_i */ assign_add_mul_r(solver_field[1], solver_field[4], -alpha_cg, N); /* Check whether the precision is reached ... */ err=square_norm(solver_field[1], N, 1); if(g_debug_level > 1 && g_proc_id == g_stdio_proc) { printf("%d\t%g\n",iteration,err); fflush( stdout); } if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))) { assign(P, solver_field[0], N); g_sloppy_precision = 0; finalize_solver(solver_field, nr_sf); return(iteration+1); } #ifdef _USE_HALFSPINOR if(((err*err <= eps_sq) && (rel_prec == 0)) || ((err*err <= eps_sq*squarenorm) && (rel_prec == 1)) || iteration > 1400) { g_sloppy_precision = 1; if(g_debug_level > 2 && g_proc_id == g_stdio_proc) { printf("sloppy precision on\n"); fflush( stdout); } } #endif /* z_j */ beta_cg = 1/pro2; /* invert_eigenvalue_part(solver_field[3], solver_field[1], 10, N); */ /* Compute beta_cg(i+1) Compute p_(i+1) = r_i+1 + beta_(i+1) p_i */ pro2 = scalar_prod_r(solver_field[1], solver_field[3], N, 1); beta_cg *= pro2; assign_mul_add_r(solver_field[2], beta_cg, solver_field[3], N); } assign(P, solver_field[0], N); g_sloppy_precision = 0; /* return(-1); */ finalize_solver(solver_field, nr_sf); return(1); }
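/* ------------------------------------------------------------------------ */
/* Illustration (not tmLQCD code): the preconditioned CG recurrence of       */
/* pcg_her(), with z = M^{-1} r and beta formed from the ratio of            */
/* successive (r, z).  A Jacobi (diagonal) preconditioner stands in for      */
/* invert_eigenvalue_part() and is applied in every iteration, whereas in    */
/* the function above the application inside the loop is currently           */
/* commented out; DIM, apply_A and the matrix entries are invented.          */
/* ------------------------------------------------------------------------ */
#include <stdio.h>

#define DIM 8

static const double diag_A[DIM] = {1, 2, 4, 8, 16, 32, 64, 128};

/* y = A x: diagonal plus a weak nearest-neighbour coupling */
static void apply_A(double *y, const double *x) {
  for (int i = 0; i < DIM; i++)
    y[i] = diag_A[i] * x[i]
         + 0.1 * ((i > 0 ? x[i - 1] : 0.0) + (i < DIM - 1 ? x[i + 1] : 0.0));
}

int main(void) {
  double x[DIM] = {0}, b[DIM], r[DIM], z[DIM], p[DIM], Ap[DIM];
  for (int i = 0; i < DIM; i++) { b[i] = 1.0; r[i] = b[i]; }

  for (int i = 0; i < DIM; i++) z[i] = r[i] / diag_A[i];  /* z0 = M^-1 r0 */
  for (int i = 0; i < DIM; i++) p[i] = z[i];              /* p0 = z0      */
  double rz = 0.0;
  for (int i = 0; i < DIM; i++) rz += r[i] * z[i];        /* (r0, z0)     */

  for (int it = 0; it < 100; it++) {
    apply_A(Ap, p);
    double pAp = 0.0;
    for (int i = 0; i < DIM; i++) pAp += p[i] * Ap[i];
    double alpha = rz / pAp;
    double err = 0.0;
    for (int i = 0; i < DIM; i++) {
      x[i] += alpha * p[i];
      r[i] -= alpha * Ap[i];
      err  += r[i] * r[i];
    }
    printf("PCG step %d |res|^2 = %e\n", it + 1, err);
    if (err < 1e-24) break;
    for (int i = 0; i < DIM; i++) z[i] = r[i] / diag_A[i]; /* z = M^-1 r   */
    double rz_new = 0.0;
    for (int i = 0; i < DIM; i++) rz_new += r[i] * z[i];
    double beta = rz_new / rz;                             /* (r,z)_new/(r,z)_old */
    rz = rz_new;
    for (int i = 0; i < DIM; i++) p[i] = z[i] + beta * p[i];
  }
  return 0;
}
/* ------------------------------------------------------------------------ */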
int cr(spinor * const P, spinor * const Q, const int m, const int max_restarts, const double eps_sq, const int rel_prec, const int N, const int precon, matrix_mult f) { int k, l, restart, i, iter = 0; double norm_sq, err; spinor * xi, * Axi, * chi, * Achi, *tmp; _Complex double alpha, beta; static _Complex double one = 1.0; double norm, rAr, newrAr; double atime, etime; spinor ** solver_field = NULL; const int nr_sf = 5; int save_sloppy = g_sloppy_precision; if(N == VOLUME) { init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); } else { init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); } atime = gettime(); xi = solver_field[0]; Axi = solver_field[1]; chi = solver_field[2]; Achi = solver_field[3]; tmp = solver_field[4]; norm_sq = square_norm(Q, N, 1); if(norm_sq < 1.e-32) { norm_sq = 1.; } dfl_sloppy_prec = 0; f(tmp, P); diff(chi, Q, tmp, N); assign(xi, chi, N); f(Axi, xi); f(Achi, chi); rAr = scalar_prod(chi, Achi, N, 1); err = square_norm(chi, N, 1); if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1))) { finalize_solver(solver_field, nr_sf); return(iter); } for(k = 0; k < m; k++) { dfl_sloppy_prec = 1; norm = square_norm(Axi, N, 1); alpha = rAr/norm; assign_add_mul(P, xi, alpha, N); /* get the new residual */ assign_diff_mul(chi, Axi, alpha, N); err = square_norm(chi, N, 1); iter ++; etime = gettime(); if(g_proc_id == g_stdio_proc && g_debug_level > 3){ printf("# CR: %d\t%g iterated residue, time spent %f s\n", iter, err, (etime - atime)); fflush(stdout); } /* Precision reached? */ if((k == m-1) || ((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1))) { break; } #ifdef _USE_HALFSPINOR if(((err*err <= eps_sq) && (rel_prec == 0)) || ((err*err <= eps_sq*norm_sq) && (rel_prec == 1))) { if (g_sloppy_precision_flag == 1) { g_sloppy_precision = 1; if(g_debug_level > 2 && g_proc_id == g_stdio_proc) { printf("sloppy precision on\n"); fflush( stdout); } } } #endif f(Achi, chi); newrAr = scalar_prod(chi, Achi, N, 1); beta = newrAr/rAr; assign_mul_add_mul(xi, beta, chi, one, N); assign_mul_add_mul(Axi,beta, Achi, one, N); rAr = newrAr; } g_sloppy_precision = save_sloppy; finalize_solver(solver_field, nr_sf); return(-1); }
/* P inout (guess for the solving spinor) Q input */ int bicgstab_complex(spinor * const P,spinor * const Q, const int max_iter, double eps_sq, const int rel_prec, const int N, matrix_mult f){ double err, d1, squarenorm; complex rho0, rho1, omega, alpha, beta, nom, denom; int i; spinor * r, * p, * v, *hatr, * s, * t; spinor ** solver_field = NULL; const int nr_sf = 6; if(N == VOLUME) { init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); } else { init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); } hatr = solver_field[0]; r = solver_field[1]; v = solver_field[2]; p = solver_field[3]; s = solver_field[4]; t = solver_field[5]; f(r, P); diff(p, Q, r, N); assign(r, p, N); assign(hatr, p, N); rho0 = scalar_prod(hatr, r, N, 1); squarenorm = square_norm(Q, N, 1); for(i = 0; i < max_iter; i++){ err = square_norm(r, N, 1); if(g_proc_id == g_stdio_proc && g_debug_level > 1) { printf("%d %e\n", i, err); fflush(stdout); } if((((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))) && i>0) { finalize_solver(solver_field, nr_sf); return(i); } f(v, p); denom = scalar_prod(hatr, v, N, 1); _div_complex(alpha, rho0, denom); assign(s, r, N); assign_diff_mul(s, v, alpha, N); f(t, s); omega = scalar_prod(t,s, N, 1); d1 = square_norm(t, N, 1); omega.re/=d1; omega.im/=d1; assign_add_mul_add_mul(P, p, s, alpha, omega, N); assign(r, s, N); assign_diff_mul(r, t, omega, N); rho1 = scalar_prod(hatr, r, N, 1); if(fabs(rho1.re) < 1.e-25 && fabs(rho1.im) < 1.e-25) { finalize_solver(solver_field, nr_sf); return(-1); } _mult_assign_complex(nom, alpha, rho1); _mult_assign_complex(denom, omega, rho0); _div_complex(beta, nom, denom); omega.re=-omega.re; omega.im=-omega.im; assign_mul_bra_add_mul_ket_add(p, v, r, omega, beta, N); rho0.re = rho1.re; rho0.im = rho1.im; } finalize_solver(solver_field, nr_sf); return -1; }
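/* ------------------------------------------------------------------------ */
/* Illustration (not tmLQCD code): the BiCGstab recurrence of                */
/* bicgstab_complex() (rho, alpha, the intermediate vectors s and t, omega   */
/* and beta) written out for a small, invented real nonsymmetric matrix so   */
/* that each step is a few lines of plain C; DIM and apply_A are made up.    */
/* ------------------------------------------------------------------------ */
#include <stdio.h>

#define DIM 6

static void apply_A(double *y, const double *x) {
  /* nonsymmetric, diagonally dominant test matrix */
  for (int i = 0; i < DIM; i++)
    y[i] = (4.0 + i) * x[i] + 0.7 * x[(i + 1) % DIM] - 0.3 * x[(i + 2) % DIM];
}

static double dot(const double *a, const double *b) {
  double s = 0.0;
  for (int i = 0; i < DIM; i++) s += a[i] * b[i];
  return s;
}

int main(void) {
  double x[DIM] = {0}, b[DIM], r[DIM], rhat[DIM], p[DIM], v[DIM], s[DIM], t[DIM];
  for (int i = 0; i < DIM; i++) { b[i] = 1.0; r[i] = b[i]; p[i] = r[i]; rhat[i] = r[i]; }
  double rho = dot(rhat, r);

  for (int it = 0; it < 200; it++) {
    apply_A(v, p);
    double alpha = rho / dot(rhat, v);
    for (int i = 0; i < DIM; i++) s[i] = r[i] - alpha * v[i];
    apply_A(t, s);
    double omega = dot(t, s) / dot(t, t);
    double err = 0.0;
    for (int i = 0; i < DIM; i++) {
      x[i] += alpha * p[i] + omega * s[i];
      r[i]  = s[i] - omega * t[i];
      err  += r[i] * r[i];
    }
    printf("BiCGstab step %d |res|^2 = %e\n", it + 1, err);
    if (err < 1e-24) break;
    double rho_new = dot(rhat, r);
    double beta = (alpha / omega) * (rho_new / rho);
    rho = rho_new;
    for (int i = 0; i < DIM; i++) p[i] = r[i] + beta * (p[i] - omega * v[i]);
  }
  return 0;
}
/* ------------------------------------------------------------------------ */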
int bicgstab2(spinor * const x0, spinor * const b, const int max_iter, double eps_sq, const int rel_prec, const int N, matrix_mult f) { const int l = 2; double err; int i, j, k; int update_app = 0, update_res = 0; double rho0, rho1, beta, alpha, omega, gamma_hat, sigma, kappa0, kappal, rho, zeta0; double squarenorm, Mx=0., Mr=0.; spinor * r[5], * u[5], * r0_tilde, * u0, * x, * xp, * bp; double Z[3][3], y0[3], yl[3], yp[3], ypp[3]; spinor ** solver_field = NULL; const int nr_sf = 10; k = -l; if(N == VOLUME) { init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); } else { init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); } r0_tilde = solver_field[0]; u0 = solver_field[1]; r[0] = solver_field[2]; u[0] = solver_field[3]; r[1] = solver_field[4]; u[1] = solver_field[5]; r[2] = solver_field[6]; u[2] = solver_field[7]; bp = solver_field[8]; xp = x0; x = solver_field[9]; zero_spinor_field(x, N); assign(u[0], b, N); f(r0_tilde, xp); diff(r[0], u[0], r0_tilde, N); zero_spinor_field(u0, N); assign(r0_tilde, r[0], N); /* random_spinor_field(r0_tilde, N); */ assign(bp, r[0], N); squarenorm = square_norm(b, N, 1); rho0 = 1.; alpha = rho0; omega = rho0; err = square_norm(r[0], N, 1); Mr = err; Mx = err; zeta0 = err; while( k < max_iter && (((err > eps_sq) && (rel_prec == 0)) || ((err > eps_sq*squarenorm) && (rel_prec == 1)) )) { k+=l; /* The BiCG part */ rho0 *= -omega; for(j = 0; j < l; j++) { rho1 = scalar_prod_r(r[j], r0_tilde, N, 1); beta = alpha*(rho1/rho0); rho0 = rho1; /* if(g_proc_id == 0) {printf("beta = %e, alpha = %e, rho0 = %e\n", beta, alpha, rho0);fflush(stdout);} */ for(i = 0; i <= j; i++) { /* u_i = r_i - \beta u_i */ assign_mul_add_r(u[i], -beta, r[i], N); } f(u[j+1], u[j]); sigma = scalar_prod_r(u[j+1], r0_tilde, N, 1); alpha = rho1/sigma; /* if(g_proc_id == 0) {printf("sigma = %e, alpha = %e\n", sigma, alpha);fflush(stdout);} */ /* x = x + \alpha u_0 */ assign_add_mul_r(x, u[0], alpha, N); /* r_i = r_i - \alpha u_{i+1} */ for(i = 0; i <= j; i++) { assign_add_mul_r(r[i], u[i+1], -alpha, N); } f(r[j+1], r[j]); err = square_norm(r[j+1], N, 1); if(g_proc_id == 0 && g_debug_level > 1) {printf("%d %d err = %e\n", k, j, err);fflush(stdout);} if(err > Mr) Mr = err; if(err > Mx) Mx = err; } /* The polynomial part */ /* Z = R* R */ for(i = 0; i <= l; i++){ for(j = 0; j <= i; j++){ Z[i][j] = scalar_prod_r(r[j], r[i], N, 1); Z[j][i] = Z[i][j]; } } /* r0tilde and rl_tilde */ y0[0] = -1; y0[2] = 0.; y0[1] = Z[1][0]/Z[1][1]; yl[0] = 0.; yl[2] = -1.; yl[1] = Z[1][2]/Z[1][1]; /* Convex combination */ for(i = 0; i < l+1; i++){ yp[i] = 0.; ypp[i] = 0.; for(j = 0; j < l+1; j++) { yp[i] +=Z[i][j]*y0[j]; ypp[i] +=Z[i][j]*yl[j]; } } kappa0 = sqrt( y0[0]*yp[0] + y0[1]*yp[1] + y0[2]*yp[2] ); kappal = sqrt( yl[0]*ypp[0] + yl[1]*ypp[1] + yl[2]*ypp[2] ); rho = (yl[0]*yp[0] + yl[1]*yp[1] + yl[2]*yp[2])/kappa0/kappal; if(fabs(rho) > 0.7) { gamma_hat = rho; } else { gamma_hat = rho*0.7/fabs(rho); } for(i = 0; i <= l; i++) { y0[i] -= gamma_hat*kappa0*yl[i]/kappal; } /* Update */ omega = y0[l]; for(i = 1; i < l+1; i++) { assign_add_mul_r(u[0], u[i], -y0[i], N); assign_add_mul_r(x, r[i-1], y0[i], N); assign_add_mul_r(r[0], r[i], -y0[i], N); } err = kappa0*kappa0; /* Reliable update part */ if(err > Mr) Mr = err; if(err > Mx) Mx = err; update_app = (err < 1.e-4*zeta0 && zeta0 <= Mx); update_res = ((err < 1.e-4*Mr && zeta0 <= Mr) || update_app); if(update_res) { if(g_proc_id == 0 && g_debug_level > 1) printf("Update res\n"); f(r[0], x); diff(r[0], bp, r[0], N); Mr = err; if(update_app) { if(g_proc_id == 0 && 
g_debug_level > 1) printf("Update app\n"); Mx = err; assign_add_mul_r(xp, x, 1., N); zero_spinor_field(x, N); assign(bp, r[0], N); } } update_app = 0; update_res = 0; if(g_proc_id == 0 && g_debug_level > 0){ printf(" BiCGstab(2)convex iterated %d %d, %e rho0 = %e, alpha = %e, gamma_hat= %e\n", l, k, err, rho0, alpha, gamma_hat); fflush( stdout ); } } assign_add_mul_r(x, xp, 1., N); assign(x0, x, N); if(k == max_iter) return(-1); return(k); }
/* P output = solution , Q input = source */ int mixed_cg_her(spinor * const P, spinor * const Q, solver_params_t solver_params, const int max_iter, double eps_sq, const int rel_prec, const int N, matrix_mult f, matrix_mult32 f32) { int i = 0, iter = 0, j = 0; float sqnrm = 0., sqnrm2, squarenorm; float pro, err, alpha_cg, beta_cg; double sourcesquarenorm, sqnrm_d, squarenorm_d; spinor *delta, *y, *xhigh; spinor32 *x, *stmp; spinor ** solver_field = NULL; spinor32 ** solver_field32 = NULL; const int nr_sf = 3; const int nr_sf32 = 4; int max_inner_it = mixcg_maxinnersolverit; int N_outer = max_iter/max_inner_it; //to be on the save side we allow at least 10 outer iterations if(N_outer < 10) N_outer = 10; int save_sloppy = g_sloppy_precision_flag; double atime, etime, flops; if(N == VOLUME) { init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); init_solver_field_32(&solver_field32, VOLUMEPLUSRAND, nr_sf32); } else { init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); init_solver_field_32(&solver_field32, VOLUMEPLUSRAND/2, nr_sf32); } squarenorm_d = square_norm(Q, N, 1); sourcesquarenorm = squarenorm_d; sqnrm_d = squarenorm_d; delta = solver_field[0]; y = solver_field[1]; xhigh = solver_field[2]; x = solver_field32[3]; assign(delta, Q, N); //set solution to zero zero_spinor_field(P, N); atime = gettime(); for(i = 0; i < N_outer; i++) { /* main CG loop in lower precision */ zero_spinor_field_32(x, N); zero_spinor_field_32(solver_field32[0], N); assign_to_32(solver_field32[1], delta, N); assign_to_32(solver_field32[2], delta, N); sqnrm = (float) sqnrm_d; sqnrm2 = sqnrm; /*inner CG loop */ for(j = 0; j <= max_inner_it; j++) { f32(solver_field32[0], solver_field32[2]); pro = scalar_prod_r_32(solver_field32[2], solver_field32[0], N, 1); alpha_cg = sqnrm2 / pro; assign_add_mul_r_32(x, solver_field32[2], alpha_cg, N); assign_mul_add_r_32(solver_field32[0], -alpha_cg, solver_field32[1], N); err = square_norm_32(solver_field32[0], N, 1); if(g_proc_id == g_stdio_proc && g_debug_level > 2) { printf("inner CG: %d res^2 %g\n", iter+j, err); fflush(stdout); } //if (((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))){ if((err <= mixcg_innereps*sqnrm)|| (j==max_inner_it) || ((1.3*err <= eps_sq) && (rel_prec == 0)) || ((1.3*err <= eps_sq*sourcesquarenorm) && (rel_prec == 1))) { break; } beta_cg = err / sqnrm2; assign_mul_add_r_32(solver_field32[2], beta_cg, solver_field32[0], N); stmp = solver_field32[0]; solver_field32[0] = solver_field32[1]; solver_field32[1] = stmp; sqnrm2 = err; } /* end inner CG loop */ iter += j; /* we want to apply a true double matrix with f(y,P) -> set sloppy off here*/ g_sloppy_precision_flag = 0; /* calculate defect in double precision */ assign_to_64(xhigh, x, N); add(P, P, xhigh, N); f(y, P); diff(delta, Q, y, N); sqnrm_d = square_norm(delta, N, 1); if(g_debug_level > 2 && g_proc_id == 0) { printf("mixed CG: last inner residue: %g\t\n", err); printf("mixed CG: true residue %d %g\t\n",iter, sqnrm_d); fflush(stdout); } /* here we can reset it to its initial value*/ g_sloppy_precision_flag = save_sloppy; if(((sqnrm_d <= eps_sq) && (rel_prec == 0)) || ((sqnrm_d <= eps_sq*sourcesquarenorm) && (rel_prec == 1))) { etime = gettime(); if(g_debug_level > 0 && g_proc_id == 0) { if(N != VOLUME){ /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */ /* 2*1608.0 because the linalg is over VOLUME/2 */ flops = (2*(2*1608.0+2*3*4) + 2*3*4 + iter*(2.*(2*1608.0+2*3*4) + 10*3*4))*N/1.0e6f; printf("# mixed CG: iter: %d eps_sq: %1.4e t/s: %1.4e\n", iter, 
eps_sq, etime-atime); printf("# mixed CG: flopcount (for e/o tmWilson only): t/s: %1.4e mflops_local: %.1f mflops: %.1f\n", etime-atime, flops/(etime-atime), g_nproc*flops/(etime-atime)); } else{ /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */ flops = (2*(1608.0+2*3*4) + 2*3*4 + iter*(2.*(1608.0+2*3*4) + 10*3*4))*N/1.0e6f; printf("# mixed CG: iter: %d eps_sq: %1.4e t/s: %1.4e\n", iter, eps_sq, etime-atime); printf("# mixed CG: flopcount (for non-e/o tmWilson only): t/s: %1.4e mflops_local: %.1f mflops: %.1f\n", etime-atime, flops/(etime-atime), g_nproc*flops/(etime-atime)); } } finalize_solver(solver_field, nr_sf); finalize_solver_32(solver_field32, nr_sf32); return(iter+i); } iter++; } finalize_solver(solver_field, nr_sf); finalize_solver_32(solver_field32, nr_sf32); return(-1); }
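/* ------------------------------------------------------------------------ */
/* Illustration (not tmLQCD code): the defect-correction structure of        */
/* mixed_cg_her().  An inner solver runs in single precision on the current  */
/* residual, the correction is added to the double precision solution and    */
/* the true residual is recomputed in double precision before the next       */
/* inner cycle.  The matrix, DIM and the fixed inner iteration count are     */
/* invented; the flop counting and sloppy-precision handling are omitted.    */
/* ------------------------------------------------------------------------ */
#include <stdio.h>

#define DIM 8

static const double diag_A[DIM] = {1, 2, 3, 4, 5, 6, 7, 8};

static void apply_A_d(double *y, const double *x) {          /* double precision */
  for (int i = 0; i < DIM; i++) y[i] = diag_A[i] * x[i];
}
static void apply_A_f(float *y, const float *x) {            /* single precision */
  for (int i = 0; i < DIM; i++) y[i] = (float)diag_A[i] * x[i];
}

int main(void) {
  double x[DIM] = {0}, b[DIM], delta[DIM], t[DIM];
  for (int i = 0; i < DIM; i++) { b[i] = 1.0; delta[i] = b[i]; }  /* delta = b - A*0 */

  for (int outer = 0; outer < 20; outer++) {
    /* inner CG in float for A e = delta (a fixed, small number of steps) */
    float e[DIM] = {0}, r[DIM], p[DIM], Ap[DIM];
    for (int i = 0; i < DIM; i++) { r[i] = (float)delta[i]; p[i] = r[i]; }
    float rr = 0.f;
    for (int i = 0; i < DIM; i++) rr += r[i] * r[i];
    for (int j = 0; j < 5; j++) {
      apply_A_f(Ap, p);
      float pAp = 0.f;
      for (int i = 0; i < DIM; i++) pAp += p[i] * Ap[i];
      float alpha = rr / pAp, rr_new = 0.f;
      for (int i = 0; i < DIM; i++) {
        e[i] += alpha * p[i];
        r[i] -= alpha * Ap[i];
        rr_new += r[i] * r[i];
      }
      float beta = rr_new / rr;
      rr = rr_new;
      for (int i = 0; i < DIM; i++) p[i] = r[i] + beta * p[i];
    }
    /* add the single precision correction, then recompute the true
       residual in double precision */
    for (int i = 0; i < DIM; i++) x[i] += (double)e[i];
    apply_A_d(t, x);
    double err = 0.0;
    for (int i = 0; i < DIM; i++) { delta[i] = b[i] - t[i]; err += delta[i] * delta[i]; }
    printf("outer %d true |res|^2 = %e\n", outer, err);
    if (err < 1e-28) break;
  }
  return 0;
}
/* ------------------------------------------------------------------------ */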
void Msap_eo(spinor * const P, spinor * const Q, const int Ncy) {
  int blk, ncy = 0, eo, vol;
  spinor * r, * a, * b;
  double nrm;
  spinor * b_even, * b_odd, * a_even, * a_odd;
  spinor ** solver_field = NULL;
  const int nr_sf = 3;

  /* it would probably be better to get the working fields as a parameter
     from the calling function */
  init_solver_field(&solver_field, VOLUME, nr_sf);
  r = solver_field[0];
  a = solver_field[1];
  b = solver_field[2];
  vol = block_list[0].volume/2;
  b_even = b;
  b_odd = b + vol + 1;
  a_even = a;
  a_odd = a + vol + 1;

  for(ncy = 0; ncy < Ncy; ncy++) {
    /* compute the global residue; this can be done more efficiently,
       here only a naive implementation is used */
    for(eo = 0; eo < 2; eo++) {
      D_psi(r, P);
      diff(r, Q, r, VOLUME);
      nrm = square_norm(r, VOLUME, 1);
      if(g_proc_id == 0 && g_debug_level > 1 && eo == 1) {
        printf("Msap: %d %1.3e\n", ncy, nrm);
      }
      /* loop over the blocks of the current colour (even or odd) */
      for (blk = 0; blk < nb_blocks; blk++) {
        if(block_list[blk].evenodd == eo) {
          /* get the part of r corresponding to block blk into b_even and b_odd */
          copy_global_to_block_eo(b_even, b_odd, r, blk);
          assign_mul_one_pm_imu_inv(a_even, b_even, +1., vol);
          Block_H_psi(&block_list[blk], a_odd, a_even, OE);
          /* a_odd = b_odd - a_odd */
          assign_mul_add_r(a_odd, -1., b_odd, vol);
          mrblk(b_odd, a_odd, 3, 1.e-31, 1, vol, &Mtm_plus_block_psi, blk);
          Block_H_psi(&block_list[blk], b_even, b_odd, EO);
          mul_one_pm_imu_inv(b_even, +1., vol);
          /* a_even = a_even - b_even */
          assign_add_mul_r(a_even, b_even, -1., vol);
          /* add the even and odd parts up to the full spinor P */
          add_eo_block_to_global(P, a_even, b_odd, blk);
        }
      }
    }
  }
  finalize_solver(solver_field, nr_sf);
  return;
}
int mixed_cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn, spinor * const Qup, spinor * const Qdn, solver_pm_t * solver_pm) { double eps_sq = solver_pm->squared_solver_prec; int noshifts = solver_pm->no_shifts; int rel_prec = solver_pm->rel_prec; int max_iter = solver_pm->max_iter; int check_abs, check_rel; double * shifts = solver_pm->shifts; int Nshift = noshifts; // algorithm double rr_up, rr_dn, rr, rr_old, r0r0, dAd_up, dAd_dn, dAd; if(rel_prec){ check_rel = 1; check_abs = 0; } else{ check_rel = 0; check_abs = 1; } int use_eo=1, eofactor=2; //not even-odd? if(solver_pm->sdim == VOLUME) { eofactor = 1; use_eo = 0; } int N = VOLUME/eofactor; int Vol = VOLUMEPLUSRAND/eofactor; // norm of source rr_up = square_norm(Qup, N, 1); rr_dn = square_norm(Qdn, N, 1); rr = rr_up + rr_dn; if( (g_cart_id == 0 && g_debug_level > 2)) printf("# CGMMSND_mixed: Initial mms residue: %.6e\n", rr); if(rr < 1.0e-4){ if( (g_cart_id == 0 && g_debug_level > 2)) printf("# CGMMSND_mixed: norm of source too low: falling back to double mms solver %.6e\n", rr); return(cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_pm)); } r0r0 = rr; // for relative precision rr_old = rr; // for the first iteration //allocate an auxiliary solver fields spinor ** sf = NULL; const int nr_sf = 6; init_solver_field(&sf, Vol, nr_sf); spinor32 ** sf32 = NULL; const int nr_sf32 = 8; init_solver_field_32(&sf32, Vol, nr_sf32); //spinor fields //we need one less than shifts, since one field is cared of by the usual cg fields init_mms_tm_nd_32(noshifts-1, Vol); // Pup/dn can be used as auxiliary field to work on, as it is not later used (could be used as initial guess at the very start) // Q_up/dn can be used as feedback, or if not, also as auxiliary field //allocate cg constants double * sigma; double * zitam1, * zita; double * alphas, * betas; double gamma; double alpham1; sigma = (double*)calloc((noshifts), sizeof(double)); zitam1 = (double*)calloc((noshifts), sizeof(double)); zita = (double*)calloc((noshifts), sizeof(double)); alphas = (double*)calloc((noshifts), sizeof(double)); betas = (double*)calloc((noshifts), sizeof(double)); spinor32 * r_up, * r_dn, * Ad_up, * Ad_dn, * x_up, * x_dn, * d_up, * d_dn; spinor * r_up_d, * r_dn_d, * x_up_d, * x_dn_d, * Ax_up_d, * Ax_dn_d; // iteration counter int j; //reliable update flag int rel_update = 0; //no of reliable updates done int no_rel_update = 0; //use reliable update flag int use_reliable = 1; double rel_delta = 1.0e-10; int trigger_shift = -1; double * res; double * res0; double * maxres; res = (double*)calloc((noshifts), sizeof(double)); res0 = (double*)calloc((noshifts), sizeof(double)); maxres = (double*)calloc((noshifts), sizeof(double)); ///////////////// // ASSIGNMENTS // ///////////////// x_up = sf32[0]; x_dn = sf32[1]; r_up = sf32[2]; r_dn = sf32[3]; d_up = sf32[4]; d_dn = sf32[5]; Ad_up = sf32[6]; Ad_dn = sf32[7]; x_up_d = sf[0]; x_dn_d = sf[1]; r_up_d = sf[2]; r_dn_d = sf[3]; Ax_up_d = sf[4]; Ax_dn_d = sf[5]; /* //matrix test spinor32 * help_low_up = sf32[0]; spinor32 * help_low_dn = sf32[1]; spinor * help_high_up = sf[0]; spinor * help_high_dn = sf[1]; assign_to_32(help_low_up, Qup, N); assign_to_32(help_low_dn, Qdn, N); assign(help_high_up, Qup, N); assign(help_high_dn, Qdn, N); double sqn_high = square_norm(help_high_up,N,1) + square_norm(help_high_dn,N,1); printf("square_norm(Q_high) = %e\n", sqn_high); float sqn_low = square_norm_32(help_low_up,N,1) + square_norm_32(help_low_dn,N,1); printf("square_norm(Q_low) = %e\n", sqn_low); solver_pm->M_ndpsi32(sf32[2], sf32[3], 
help_low_up, help_low_dn); solver_pm->M_ndpsi(sf[2], sf[3], help_high_up, help_high_dn); assign_to_64(sf[4], sf32[2], N); assign_to_64(sf[5], sf32[3], N); diff(sf[0], sf[4], sf[2], N); diff(sf[1], sf[5], sf[3], N); double sqnrm = square_norm(sf[0], N, 1) + square_norm(sf[1], N, 1); printf("Operator 32 test: (square_norm) / (spinor component) = %.8e\n", sqnrm/24.0/N); exit(1); */ // r(0) = b assign_to_32(r_up, Qup, N); assign_to_32(r_dn, Qdn, N); // d(0) = b assign_to_32(d_up, Qup, N); assign_to_32(d_dn, Qdn, N); maxres[0] = rr; res[0] = rr; res0[0] = rr; alphas[0] = 1.0; betas[0] = 0.0; sigma[0] = shifts[0]*shifts[0]; if(g_cart_id == 0 && g_debug_level > 2) printf("# CGMMSND_mixed: shift %d is %e\n", 0, sigma[0]); // currently only implemented for P=0 for(int im = 1; im < noshifts; im++) { maxres[im] = rr; res[im] = rr; res0[im] = rr; sigma[im] = shifts[im]*shifts[im] - sigma[0]; if(g_cart_id == 0 && g_debug_level > 2) printf("# CGMMSND_mixed: shift %d is %e\n", im, sigma[im]); // these will be the result spinor fields zero_spinor_field_32(mms_x_up[im-1], N); zero_spinor_field_32(mms_x_dn[im-1], N); assign_to_32(mms_d_up[im-1], Qup, N); assign_to_32(mms_d_dn[im-1], Qdn, N); zitam1[im] = 1.0; zita[im] = 1.0; alphas[im] = 1.0; betas[im] = 0.0; } //zero fields for solution Pup, Pdn for(int im = 0; im < noshifts; im++){ zero_spinor_field(Pup[im], N); zero_spinor_field(Pdn[im], N); } ////////// // LOOP // ////////// for (j = 0; j < max_iter; j++) { // A*d(k) solver_pm->M_ndpsi32(Ad_up, Ad_dn, d_up, d_dn); //add zero'th shift assign_add_mul_r_32(Ad_up, d_up, (float) sigma[0], N); assign_add_mul_r_32(Ad_dn, d_dn, (float) sigma[0], N); // alpha = r(k)*r(k) / d(k)*A*d(k) dAd_up = scalar_prod_r_32(d_up, Ad_up, N, 1); dAd_dn = scalar_prod_r_32(d_dn, Ad_dn, N, 1); dAd = dAd_up + dAd_dn; alpham1 = alphas[0]; alphas[0] = rr_old / dAd; // rr_old is taken from the last iteration respectively // r(k+1) assign_add_mul_r_32(r_up, Ad_up, (float) -alphas[0],N); assign_add_mul_r_32(r_dn, Ad_dn, (float) -alphas[0],N); // r(k+1)*r(k+1) rr_up = square_norm_32(r_up, N, 1); rr_dn = square_norm_32(r_dn, N, 1); rr = rr_up + rr_dn; if((g_cart_id == 0) && (g_debug_level > 2)) printf("# CGMMSND_mixed: mms iteration j = %i: rr = %.6e\n", j, rr); // aborting ?? // check wether precision is reached ... 
if ( ((check_abs)&&(rr <= eps_sq)) || ((check_rel)&&(rr <= eps_sq*r0r0)) ) { if ((check_rel)&&(rr <= eps_sq*r0r0)) { if((g_cart_id == 0) && (g_debug_level > 3)) printf("# CGMMSND_mixed: Reached relative solver precision of eps_rel = %.2e\n", eps_sq); } break; } // update alphas and zitas // used later for(int im = 1; im < noshifts; im++) { gamma = zita[im]*alpham1/(alphas[0]*betas[0]*(1.-zita[im]/zitam1[im]) + alpham1*(1.+sigma[im]*alphas[0])); zitam1[im] = zita[im]; zita[im] = gamma; alphas[im] = alphas[0]*zita[im]/zitam1[im]; } //check for reliable update res[0] = rr; for(int im=1; im<noshifts; im++) res[im] = rr * zita[im]; rel_update = 0; for(int im = (noshifts-1); im >= 0; im--) { if( res[im] > maxres[im] ) maxres[im] = res[im]; if( (res[im] < rel_delta*res0[im]) && (res0[im]<=maxres[im]) && (use_reliable) ) rel_update=1; if( rel_update && ( trigger_shift == -1) ) trigger_shift = im; } if(!rel_update) { // x_j(k+1) = x_j(k) + alpha_j*d_j(k) // alphas are set above assign_add_mul_r_32(x_up, d_up, (float) alphas[0], N); assign_add_mul_r_32(x_dn, d_dn, (float) alphas[0], N); for(int im = 1; im < noshifts; im++) { assign_add_mul_r_32(mms_x_up[im-1], mms_d_up[im-1], (float) alphas[im], N); assign_add_mul_r_32(mms_x_dn[im-1], mms_d_dn[im-1], (float) alphas[im], N); } // beta = r(k+1)*r(k+1) / r(k)*r(k) betas[0] = rr / rr_old; rr_old = rr; // for next iteration // d_0(k+1) = r(k+1) + beta*d_0(k) assign_mul_add_r_32(d_up, (float) betas[0], r_up, N); assign_mul_add_r_32(d_dn, (float) betas[0], r_dn, N); // d_j(k+1) = zita*r(k+1) + beta*d_j(k) for(int im = 1; im < noshifts; im++) { betas[im] = betas[0]*zita[im]*alphas[im]/(zitam1[im]*alphas[0]); assign_mul_add_mul_r_32(mms_d_up[im-1], r_up, (float) betas[im], (float) zita[im], N); assign_mul_add_mul_r_32(mms_d_dn[im-1], r_dn, (float) betas[im], (float) zita[im], N); } } else{ //reliable update if( (g_cart_id == 0) && (g_debug_level > 3) ){ printf("# CGMMSND_mixed: Shift %d with offset squared %e triggered a reliable update\n", trigger_shift, sigma[trigger_shift]); } //add low prec solutions assign_add_mul_r_32(x_up, d_up, (float) alphas[0], N); assign_add_mul_r_32(x_dn, d_dn, (float) alphas[0], N); addto_32(Pup[0], x_up, N); addto_32(Pdn[0], x_dn, N); for(int im = 1; im < noshifts; im++) { assign_add_mul_r_32(mms_x_up[im-1], mms_d_up[im-1], alphas[im], N); assign_add_mul_r_32(mms_x_dn[im-1], mms_d_dn[im-1], alphas[im], N); addto_32(Pup[im], mms_x_up[im-1], N); addto_32(Pdn[im], mms_x_dn[im-1], N); } //add low precision for shift 0 only addto_32(x_up_d, x_up, N); addto_32(x_dn_d, x_dn, N); solver_pm->M_ndpsi(Ax_up_d, Ax_dn_d, x_up_d, x_dn_d); //add zero'th shift assign_add_mul_r(Ax_up_d, x_up_d, sigma[0], N); assign_add_mul_r(Ax_dn_d, x_dn_d, sigma[0], N); diff(r_up_d, Qup, Ax_up_d, N); diff(r_dn_d, Qdn, Ax_dn_d, N); rr_up = square_norm(r_up_d, N, 1); rr_dn = square_norm(r_dn_d, N, 1); rr = rr_up + rr_dn; if ((g_cart_id == 0) && (g_debug_level > 3) ) printf("# CGMMSND_mixed: New residue after reliable update: %.6e\n", rr); //update res[im] res[0] = rr; if(res[trigger_shift] > res0[trigger_shift]){ if(g_cart_id == 0) printf("# CGMMSND_mixed: Warning: residue of shift no %d got larger after rel. 
update\n", trigger_shift); //if this is the zero'th shift not getting better -> no further convergence, break if(trigger_shift == 0) break; } //zero float fields zero_spinor_field_32(x_up, N); zero_spinor_field_32(x_dn, N); for(int im = 1; im < noshifts; im++) { zero_spinor_field_32(mms_x_up[im-1], N); zero_spinor_field_32(mms_x_dn[im-1], N); } //update the source assign_to_32(r_up, r_up_d, N); assign_to_32(r_dn, r_dn_d, N); betas[0] = res[0]/rr_old; rr_old = rr; // d_0(k+1) = r(k+1) + beta*d_0(k) assign_mul_add_r_32(d_up, betas[0], r_up, N); assign_mul_add_r_32(d_dn, betas[0], r_dn, N); // d_j(k+1) = r(k+1) + beta*d_j(k) for(int im = 1; im < noshifts; im++) { betas[im] = betas[0]*zita[im]*alphas[im]/(zitam1[im]*alphas[0]); assign_mul_add_mul_r_32(mms_d_up[im-1], r_up, (float) betas[im], (float) zita[im], N); assign_mul_add_mul_r_32(mms_d_dn[im-1], r_dn, (float) betas[im], (float) zita[im], N); } //new maxres for the shift that initiated the reliable update res[trigger_shift] = res[0]*zita[trigger_shift]*zita[trigger_shift]; res0[trigger_shift] = res[trigger_shift]; maxres[trigger_shift] = res[trigger_shift]; trigger_shift = -1; no_rel_update ++; } //reliable update //check if some shift is converged for(int im = 1; im < noshifts; im++) { if(j > 0 && (j % 10 == 0) && (im == noshifts-1)) { double sn = square_norm_32(mms_d_up[im-1], N, 1); sn += square_norm_32(mms_d_dn[im-1], N, 1); if(alphas[noshifts-1]*alphas[noshifts-1]*sn <= eps_sq) { noshifts--; if( (g_debug_level > 1) && (g_cart_id == 0) ) { printf("# CGMMSND_mixed: at iteration %d removed one shift, %d remaining\n", j, noshifts); } //if removed we add the latest solution vector for this shift addto_32(Pup[im], mms_x_up[im-1], N); addto_32(Pdn[im], mms_x_dn[im-1], N); } } } }//LOOP if( (g_cart_id == 0) && (g_debug_level > 1) ) printf("Final mms residue: %.6e\n", rr); //add the latest solutions for(int im = 0; im < noshifts; im++) { if(im == 0){ addto_32(Pup[0], x_up, N); addto_32(Pdn[0], x_dn, N); } else{ addto_32(Pup[im], mms_x_up[im-1], N); addto_32(Pdn[im], mms_x_dn[im-1], N); } } if(g_debug_level > 4){ if(g_cart_id == 0) printf("# CGMMSND_mixed: Checking mms result:\n"); //loop over all shifts (-> Nshift) for(int im = 0; im < Nshift; im++){ solver_pm->M_ndpsi(sf[0], sf[1], Pup[im], Pdn[im]); assign_add_mul_r(sf[0], Pup[im] , shifts[im]*shifts[im], N); assign_add_mul_r(sf[1], Pdn[im] , shifts[im]*shifts[im], N); diff(sf[2], sf[0], Qup, N); diff(sf[3], sf[1], Qdn, N); rr_up = square_norm(sf[2], N, 1); rr_dn = square_norm(sf[3], N, 1); rr = rr_up + rr_dn; if(g_cart_id == 0) printf("# CGMMSND_mixed: Shift[%d] squared residue: %e\n", im, rr); } } finalize_solver(sf, nr_sf); finalize_solver_32(sf32, nr_sf32); //free cg constants free(sigma); free(zitam1); free(zita); free(alphas); free(betas); //free reliable update stuff free(res); free(res0); free(maxres); //if not converged -> return(-1) if(j<max_iter){ return(j); } else{ return(-1); } }//
int invert_doublet_eo_quda(spinor * const Even_new_s, spinor * const Odd_new_s,
                           spinor * const Even_new_c, spinor * const Odd_new_c,
                           spinor * const Even_s, spinor * const Odd_s,
                           spinor * const Even_c, spinor * const Odd_c,
                           const double precision, const int max_iter,
                           const int solver_flag, const int rel_prec, const int even_odd_flag,
                           const SloppyPrecision sloppy_precision, CompressionType compression) {

  spinor ** solver_field = NULL;
  const int nr_sf = 4;
  init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);

  convert_eo_to_lexic(solver_field[0], Even_s, Odd_s);
  convert_eo_to_lexic(solver_field[1], Even_c, Odd_c);
  // convert_eo_to_lexic(g_spinor_field[DUM_DERI+1], Even_new, Odd_new);

  void *spinorIn    = (void*)solver_field[0]; // source
  void *spinorIn_c  = (void*)solver_field[1]; // charm source
  void *spinorOut   = (void*)solver_field[2]; // solution
  void *spinorOut_c = (void*)solver_field[3]; // charm solution

  if ( rel_prec )
    inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL;
  else
    inv_param.residual_type = QUDA_L2_ABSOLUTE_RESIDUAL;

  inv_param.kappa = g_kappa;

  // IMPORTANT: use opposite TM mu-flavor since gamma5 -> -gamma5
  inv_param.mu = -g_mubar /2./g_kappa;
  inv_param.epsilon = g_epsbar/2./g_kappa;

  // figure out which BC to use (theta, trivial...)
  set_boundary_conditions(&compression);

  // set the sloppy precision of the mixed prec solver
  set_sloppy_prec(sloppy_precision);

  // load gauge after setting precision
  _loadGaugeQuda(compression);

  // choose dslash type
  if( g_c_sw > 0.0 ) {
    inv_param.dslash_type = QUDA_TWISTED_CLOVER_DSLASH;
    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
    inv_param.solution_type = QUDA_MAT_SOLUTION;
    inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
    inv_param.clover_coeff = g_c_sw*g_kappa;
  }
  else {
    inv_param.dslash_type = QUDA_TWISTED_MASS_DSLASH;
    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN_ASYMMETRIC;
    inv_param.solution_type = QUDA_MAT_SOLUTION;
  }

  // choose solver
  if(solver_flag == BICGSTAB) {
    if(g_proc_id == 0) {printf("# QUDA: Using BiCGstab!\n"); fflush(stdout);}
    inv_param.inv_type = QUDA_BICGSTAB_INVERTER;
  }
  else {
    /* Here we invert the hermitian operator squared */
    inv_param.inv_type = QUDA_CG_INVERTER;
    if(g_proc_id == 0) {
      printf("# QUDA: Using mixed precision CG!\n");
      printf("# QUDA: mu = %f, kappa = %f\n", g_mu/2./g_kappa, g_kappa);
      fflush(stdout);
    }
  }

  if( even_odd_flag ) {
    inv_param.solve_type = QUDA_NORMOP_PC_SOLVE;
    if(g_proc_id == 0) printf("# QUDA: Using preconditioning!\n");
  }
  else {
    inv_param.solve_type = QUDA_NORMOP_SOLVE;
    if(g_proc_id == 0) printf("# QUDA: Not using preconditioning!\n");
  }

  inv_param.tol = sqrt(precision)*0.25;
  inv_param.maxiter = max_iter;

  inv_param.twist_flavor = QUDA_TWIST_NONDEG_DOUBLET;
  inv_param.Ls = 2;

  // NULL pointers to host fields to force
  // construction instead of download of the clover field:
  if( g_c_sw > 0.0 )
    loadCloverQuda(NULL, NULL, &inv_param);

  // reorder spinor
  reorder_spinor_toQuda( (double*)spinorIn, inv_param.cpu_prec, 1, (double*)spinorIn_c );

  // perform the inversion
  invertQuda(spinorOut, spinorIn, &inv_param);

  if( inv_param.verbosity == QUDA_VERBOSE )
    if(g_proc_id == 0)
      printf("# QUDA: Device memory used: Spinor: %f GiB, Gauge: %f GiB, Clover: %f GiB\n",
             inv_param.spinorGiB, gauge_param.gaugeGiB, inv_param.cloverGiB);
  if( inv_param.verbosity > QUDA_SILENT )
    if(g_proc_id == 0)
      printf("# QUDA: Done: %i iter / %g secs = %g Gflops\n",
             inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs);

  // number of CG iterations
  int iteration = inv_param.iter;

  // reorder spinor
  reorder_spinor_fromQuda( (double*)spinorIn,  inv_param.cpu_prec, 1, (double*)spinorIn_c );
  reorder_spinor_fromQuda( (double*)spinorOut, inv_param.cpu_prec, 1, (double*)spinorOut_c );
  convert_lexic_to_eo(Even_s,     Odd_s,     solver_field[0]);
  convert_lexic_to_eo(Even_c,     Odd_c,     solver_field[1]);
  convert_lexic_to_eo(Even_new_s, Odd_new_s, solver_field[2]);
  convert_lexic_to_eo(Even_new_c, Odd_new_c, solver_field[3]);

  finalize_solver(solver_field, nr_sf);
  freeGaugeQuda();

  if(iteration >= max_iter)
    return(-1);
  return(iteration);
}
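/*
 * Illustration (not part of the build): most of the work in invert_doublet_eo_quda()
 * is translating tmLQCD's 2*kappa twisted-mass conventions into QUDA's inv_param
 * fields.  The standalone sketch below redoes that arithmetic with hypothetical
 * plain-double stand-ins for g_kappa, g_mubar, g_epsbar, g_c_sw and the requested
 * precision; reading the factor 0.25 as a safety margin on the tolerance is an
 * assumption, not something the code above states.
 */
#if 0   /* standalone sketch -- compile separately, e.g. cc -std=c99 sketch.c -lm */
#include <math.h>
#include <stdio.h>

int main(void) {
  const double kappa  = 0.1373;   /* hypothetical g_kappa  */
  const double mubar  = 0.1500;   /* hypothetical g_mubar  */
  const double epsbar = 0.1900;   /* hypothetical g_epsbar */
  const double c_sw   = 1.57;     /* hypothetical g_c_sw   */
  const double precision = 1.e-18;/* squared residual target passed to the inverter */

  /* tmLQCD keeps twisted masses in the 2*kappa convention; QUDA expects the
     physical normalisation, hence the division by 2*kappa.  The sign of mu
     flips because the basis change sends gamma5 -> -gamma5. */
  double quda_mu      = -mubar  / (2. * kappa);
  double quda_epsilon =  epsbar / (2. * kappa);
  double quda_clover  =  c_sw * kappa;          /* clover_coeff = c_sw * kappa */

  /* QUDA monitors |r| rather than |r|^2, so the tolerance is the square root
     of the requested precision (times an assumed safety factor of 0.25). */
  double quda_tol     = sqrt(precision) * 0.25;

  printf("mu = %g, epsilon = %g, clover_coeff = %g, tol = %g\n",
         quda_mu, quda_epsilon, quda_clover, quda_tol);
  return 0;
}
#endif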
/* P output = solution , Q input = source */ int cg_her_nd(spinor * const P_up,spinor * P_dn, spinor * const Q_up, spinor * const Q_dn, const int max_iter, double eps_sq, const int rel_prec, const int N, matrix_mult_nd f) { double normsp, normsq, pro, err, alpha_cg, beta_cg, squarenorm; int iteration; double err1, err2; spinor ** up_field = NULL; spinor ** dn_field = NULL; const int nr_sf = 5; /* do we really need so many fields??? */ init_solver_field(&up_field, VOLUMEPLUSRAND, nr_sf); init_solver_field(&dn_field, VOLUMEPLUSRAND, nr_sf); squarenorm = square_norm(Q_up, N, 1); squarenorm+= square_norm(Q_dn, N, 1); /* !!!! INITIALIZATION !!!! */ assign(up_field[0], P_up, N); assign(dn_field[0], P_dn, N); /* (r_0,r_0) = normsq */ normsp =square_norm(P_up, N, 1); normsp+=square_norm(P_dn, N, 1); /* assign(up_field[5], Q_up, N); */ /* assign(dn_field[5], Q_dn, N); */ /* initialize residue r and search vector p */ if(normsp==0){ /* if a starting solution vector equal to zero is chosen */ assign(up_field[1], Q_up, N); assign(dn_field[1], Q_dn, N); assign(up_field[2], Q_up, N); assign(dn_field[2], Q_dn, N); normsq =square_norm(Q_up, N, 1); normsq+=square_norm(Q_dn, N, 1); } else { /* if a starting solution vector different from zero is chosen */ f(up_field[3],dn_field[3], up_field[0],dn_field[0]); diff(up_field[1], Q_up, up_field[3], N); diff(dn_field[1], Q_dn, dn_field[3], N); assign(up_field[2], up_field[1], N); assign(dn_field[2], dn_field[1], N); normsq =square_norm(up_field[2], N, 1); normsq+=square_norm(dn_field[2], N, 1); } /* main loop */ for(iteration=0;iteration<max_iter;iteration++){ f(up_field[4],dn_field[4], up_field[2],dn_field[2]); pro =scalar_prod_r(up_field[2], up_field[4], N, 1); pro+=scalar_prod_r(dn_field[2], dn_field[4], N, 1); /* Compute alpha_cg(i+1) */ alpha_cg=normsq/pro; /* Compute x_(i+1) = x_i + alpha_cg(i+1) p_i */ assign_add_mul_r(up_field[0], up_field[2], alpha_cg, N); assign_add_mul_r(dn_field[0], dn_field[2], alpha_cg, N); /* Compute r_(i+1) = r_i - alpha_cg(i+1) Qp_i */ assign_add_mul_r(up_field[1], up_field[4], -alpha_cg, N); assign_add_mul_r(dn_field[1], dn_field[4], -alpha_cg, N); /* Check whether the precision is reached ... */ err1 =square_norm(up_field[1], N, 1); err2 =square_norm(dn_field[1], N, 1); err = err1 + err2; if(g_debug_level > 1 && g_proc_id == g_stdio_proc) { printf("cg_her_nd : i = %d esqr %e = %e + %e \n",iteration,err, err1, err2); fflush( stdout); } if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))) { assign(P_up, up_field[0], N); assign(P_dn, dn_field[0], N); g_sloppy_precision = 0; finalize_solver(up_field, nr_sf); finalize_solver(dn_field, nr_sf); return(iteration+1); } #ifdef _USE_HALFSPINOR if(((err*err <= eps_sq) && (rel_prec == 0)) || ((err*err <= eps_sq*squarenorm) && (rel_prec == 1))) { g_sloppy_precision = 1; if(g_debug_level > 2 && g_proc_id == g_stdio_proc) { printf("sloppy precision on\n"); fflush( stdout); } } #endif /* Compute beta_cg(i+1) Compute p_(i+1) = r_i+1 + beta_(i+1) p_i */ beta_cg=err/normsq; assign_mul_add_r(up_field[2], beta_cg, up_field[1], N); assign_mul_add_r(dn_field[2], beta_cg, dn_field[1], N); normsq=err; } assign(P_up, up_field[0], N); assign(P_dn, dn_field[0], N); g_sloppy_precision = 0; finalize_solver(up_field, nr_sf); finalize_solver(dn_field, nr_sf); return(-1); }
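/*
 * Illustration (not part of the build): cg_her_nd() runs one standard CG recurrence
 * on the two-flavour (up/dn) system, simply summing norms and scalar products over
 * both components.  The sketch below is the same alpha/beta recurrence for a single
 * component, using a hypothetical 4x4 symmetric positive definite matrix in place
 * of the operator f.
 */
#if 0   /* standalone sketch -- compile separately */
#include <math.h>
#include <stdio.h>

#define N 4

/* hypothetical dense stand-in for the hermitian positive definite operator f */
static const double A[N][N] = {
  {4, 1, 0, 0},
  {1, 3, 1, 0},
  {0, 1, 3, 1},
  {0, 0, 1, 2}
};

static void matvec(double *w, const double *v) {
  for (int i = 0; i < N; i++) {
    w[i] = 0.;
    for (int j = 0; j < N; j++) w[i] += A[i][j] * v[j];
  }
}

static double dot(const double *a, const double *b) {
  double s = 0.;
  for (int i = 0; i < N; i++) s += a[i] * b[i];
  return s;
}

int main(void) {
  double b[N] = {1, 2, 3, 4}, x[N] = {0}, r[N], p[N], Ap[N];
  /* x = 0, so r = p = b, exactly as in the normsp == 0 branch above */
  for (int i = 0; i < N; i++) { r[i] = b[i]; p[i] = b[i]; }
  double normsq = dot(r, r);

  for (int it = 0; it < 100 && normsq > 1.e-24; it++) {
    matvec(Ap, p);
    double alpha = normsq / dot(p, Ap);                /* alpha_cg = (r,r)/(p,Ap) */
    for (int i = 0; i < N; i++) x[i] += alpha * p[i];  /* x <- x + alpha p        */
    for (int i = 0; i < N; i++) r[i] -= alpha * Ap[i]; /* r <- r - alpha Ap       */
    double err  = dot(r, r);
    double beta = err / normsq;                        /* beta_cg = (r',r')/(r,r) */
    for (int i = 0; i < N; i++) p[i] = r[i] + beta * p[i];
    normsq = err;
    printf("cg sketch: it = %d |r|^2 = %e\n", it, err);
  }
  return 0;
}
#endif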
int gmres_dr(spinor * const P,spinor * const Q, const int m, const int nr_ev, const int max_restarts, const double eps_sq, const int rel_prec, const int N, matrix_mult f){ int restart=0, i, j, k, l; double beta, eps, norm, beta2=0.; complex *lswork = NULL; int lwork; complex tmp1, tmp2; int info=0; int _m = m, mp1 = m+1, np1 = nr_ev+1, ne = nr_ev, V2 = 12*(VOLUMEPLUSRAND)/2, _N = 12*N; spinor ** solver_field = NULL; const int nr_sf = 3; if(N == VOLUME) { init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); } else { init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); } double err=0.; spinor * r0, * x0; cmone.re = -1.; cmone.im=0.; cpone.re = 1.; cpone.im=0.; czero.re = 0.; czero.im = 0.; r0 = solver_field[0]; x0 = solver_field[2]; eps=sqrt(eps_sq); init_gmres_dr(m, (VOLUMEPLUSRAND)); norm = sqrt(square_norm(Q, N, 1)); assign(x0, P, N); /* first normal GMRES cycle */ /* r_0=Q-AP (b=Q, x+0=P) */ f(r0, x0); diff(r0, Q, r0, N); /* v_0=r_0/||r_0|| */ alpha[0].re=sqrt(square_norm(r0, N, 1)); err = alpha[0].re; if(g_proc_id == g_stdio_proc && g_debug_level > 0){ printf("%d\t%e true residue\n", restart*m, alpha[0].re*alpha[0].re); fflush(stdout); } if(alpha[0].re==0.){ assign(P, x0, N); finalize_solver(solver_field, nr_sf); return(restart*m); } mul_r(V[0], 1./alpha[0].re, r0, N); for(j = 0; j < m; j++){ /* solver_field[0]=A*v_j */ /* Set h_ij and omega_j */ /* solver_field[1] <- omega_j */ f(solver_field[1], V[j]); /* assign(solver_field[1], solver_field[0], N); */ for(i = 0; i <= j; i++){ H[i][j] = scalar_prod(V[i], solver_field[1], N, 1); /* G, work and work2 are in Fortran storage: columns first */ G[j][i] = H[i][j]; work2[j][i] = H[i][j]; work[i][j].re = H[i][j].re; work[i][j].im = -H[i][j].im; assign_diff_mul(solver_field[1], V[i], H[i][j], N); } _complex_set(H[j+1][j], sqrt(square_norm(solver_field[1], N, 1)), 0.); G[j][j+1] = H[j+1][j]; work2[j][j+1] = H[j+1][j]; work[j+1][j].re = H[j+1][j].re; work[j+1][j].im = -H[j+1][j].im; beta2 = H[j+1][j].re*H[j+1][j].re; for(i = 0; i < j; i++){ tmp1 = H[i][j]; tmp2 = H[i+1][j]; _mult_real(H[i][j], tmp2, s[i]); _add_assign_complex_conj(H[i][j], c[i], tmp1); _mult_real(H[i+1][j], tmp1, s[i]); _diff_assign_complex(H[i+1][j], c[i], tmp2); } /* Set beta, s, c, alpha[j],[j+1] */ beta = sqrt(_complex_square_norm(H[j][j]) + _complex_square_norm(H[j+1][j])); s[j] = H[j+1][j].re / beta; _mult_real(c[j], H[j][j], 1./beta); _complex_set(H[j][j], beta, 0.); _mult_real(alpha[j+1], alpha[j], s[j]); tmp1 = alpha[j]; _mult_assign_complex_conj(alpha[j], c[j], tmp1); /* precision reached? 
*/ if(g_proc_id == g_stdio_proc && g_debug_level > 0){ printf("%d\t%e residue\n", restart*m+j, alpha[j+1].re*alpha[j+1].re); fflush(stdout); } if(((alpha[j+1].re <= eps) && (rel_prec == 0)) || ((alpha[j+1].re <= eps*norm) && (rel_prec == 1))){ _mult_real(alpha[j], alpha[j], 1./H[j][j].re); assign_add_mul(x0, V[j], alpha[j], N); for(i = j-1; i >= 0; i--){ for(k = i+1; k <= j; k++){ _mult_assign_complex(tmp1, H[i][k], alpha[k]); /* alpha[i] -= tmp1 */ _diff_complex(alpha[i], tmp1); } _mult_real(alpha[i], alpha[i], 1./H[i][i].re); assign_add_mul(x0, V[i], alpha[i], N); } for(i = 0; i < m; i++){ alpha[i].im = 0.; } assign(P, x0, N); finalize_solver(solver_field, nr_sf); return(restart*m+j); } /* if not */ else { mul_r(V[(j+1)], 1./H[j+1][j].re, solver_field[1], N); } } j=m-1; /* prepare for restart */ _mult_real(alpha[j], alpha[j], 1./H[j][j].re); assign_add_mul(x0, V[j], alpha[j], N); if(g_proc_id == 0 && g_debug_level > 3) { printf("alpha: %e %e\n", alpha[j].re, alpha[j].im); } for(i = j-1; i >= 0; i--){ for(k = i+1; k <= j; k++){ _mult_assign_complex(tmp1, H[i][k], alpha[k]); _diff_complex(alpha[i], tmp1); } _mult_real(alpha[i], alpha[i], 1./H[i][i].re); if(g_proc_id == 0 && g_debug_level > 3) { printf("alpha: %e %e\n", alpha[i].re, alpha[i].im); } assign_add_mul(x0, V[i], alpha[i], N); } /* This produces c=V_m+1*r0 */ for(i = 0; i < mp1; i++) { c[i] = scalar_prod(V[i], r0, N, 1); if(g_proc_id == 0 && g_debug_level > 3) { printf("c: %e %e err = %e\n", c[i].re, c[i].im, err); } } for(restart = 1; restart < max_restarts; restart++) { /* compute c-\bar H \alpha */ _FT(zgemv) ("N", &mp1, &_m, &cmone, G[0], &mp1, alpha, &one, &cpone, c, &one, 1); err = sqrt(short_scalar_prod(c, c, mp1).re); if(g_proc_id == 0 && g_debug_level > 0) { printf("%d\t %e short residue\n", m*restart, err*err); } /* Compute new residual r0 */ /* r_0=Q-AP (b=Q, x+0=P) */ if(g_debug_level > 0) { f(r0, x0); diff(r0, Q, r0, N); tmp1.im=sqrt(square_norm(r0, N, 1)); if(g_proc_id == g_stdio_proc){ printf("%d\t%e true residue\n", m*restart, tmp1.im*tmp1.im); fflush(stdout); } } mul(r0, c[0], V[0], N); for(i = 1; i < mp1; i++) { assign_add_mul(r0, V[i], c[i], N); } if(g_debug_level > 3) { tmp1.im=sqrt(square_norm(r0, N, 1)); if(g_proc_id == g_stdio_proc){ printf("%d\t%e residue\n", m*restart, tmp1.im*tmp1.im); fflush(stdout); } } /* Stop if satisfied */ if(err < eps){ assign(P, x0, N); finalize_solver(solver_field, nr_sf); return(restart*m); } /* Prepare to compute harmonic Ritz pairs */ for(i = 0; i < m-1; i++){ alpha[i].re = 0.; alpha[i].im = 0.; } alpha[m-1].re = 1.; alpha[m-1].im = 0.; _FT(zgesv) (&_m, &one, work[0], &mp1, idx, alpha, &_m, &info); for(i = 0; i < m; i++) { G[m-1][i].re += (beta2*alpha[idx[i]-1].re); G[m-1][i].im += (beta2*alpha[idx[i]-1].im); } if(g_proc_id == 0 && g_debug_level > 3){ printf("zgesv returned info = %d, c[m-1]= %e, %e , idx[m-1]=%d\n", info, alpha[idx[m-1]-1].re, alpha[idx[m-1]-1].im, idx[m-1]); } /* c - \bar H * d -> c */ /* G contains H + \beta^2 H^-He_n e_n^H */ /* Compute harmonic Ritz pairs */ diagonalise_general_matrix(m, G[0], mp1, alpha, evalues); for(i = 0; i < m; i++) { sortarray[i] = _complex_square_norm(evalues[i]); idx[i] = i; } quicksort(m, sortarray, idx); if(g_proc_id == g_stdio_proc && g_debug_level > 1) { for(i = 0; i < m; i++) { printf("# Evalues %d %e %e \n", i, evalues[idx[i]].re, evalues[idx[i]].im); } fflush(stdout); } /* Copy the first nr_ev eigenvectors to work */ for(i = 0; i < ne; i++) { for(l = 0; l < m; l++) { work[i][l] = G[idx[i]][l]; } } /* Orthonormalize them */ 
for(i = 0; i < ne; i++) { work[i][m].re = 0.; work[i][m].im = 0.; short_ModifiedGS(work[i], m, i, work[0], mp1); } /* Orthonormalize c - \bar H d to work */ short_ModifiedGS(c, m+1, ne, work[0], mp1); for(i = 0; i < mp1; i++) { work[nr_ev][i] = c[i]; } /* Now compute \bar H = P^T_k+1 \bar H_m P_k */ for(i = 0; i < mp1; i++) { for(l = 0; l < mp1; l++) { H[i][l].re = 0.; H[i][l].im = 0.; } } _FT(zgemm) ("N", "N", &mp1, &ne, &_m, &cpone, work2[0], &mp1, work[0], &mp1, &czero, G[0], &mp1, 1, 1); _FT(zgemm) ("C", "N", &np1, &ne , &mp1, &cpone, work[0], &mp1, G[0], &mp1, &czero, H[0], &mp1, 1, 1); if(g_debug_level > 3) { for(i = 0; i < ne+1; i++) { for(l = 0; l < ne+1; l++) { if(g_proc_id == 0) { printf("(g[%d], g[%d]) = %e, %e\n", i, l, short_scalar_prod(work[i], work[l], m+1).re, short_scalar_prod(work[i], work[l], m+1).im); printf("(g[%d], g[%d]) = %e, %e\n", l, i, short_scalar_prod(work[l], work[i], m+1).re, short_scalar_prod(work[l], work[i], m+1).im); } } } } /* V_k+1 = V_m+1 P_k+1 */ /* _FT(zgemm) ("N", "N", &_N, &np1, &mp1, &cpone, (complex*)V[0], &V2, work[0], &mp1, &czero, (complex*)Z[0], &V2, 1, 1); */ for(l = 0; l < np1; l++) { mul(Z[l], work[l][0], V[0], N); for(i = 1; i < mp1; i++) { assign_add_mul(Z[l], V[i], work[l][i], N); } } /* copy back to V */ for(i = 0; i < np1; i++) { assign(V[i], Z[i], N); } /* Reorthogonalise v_nr_ev */ ModifiedGS((complex*)V[nr_ev], _N, nr_ev, (complex*)V[0], V2); if(g_debug_level > 3) { for(i = 0; i < np1; i++) { for(l = 0; l < np1; l++) { tmp1 = scalar_prod(V[l], V[i], N, 1); if(g_proc_id == 0) { printf("(V[%d], V[%d]) = %e %e %d %d %d %d %d %d %e %e\n", l, i, tmp1.re, tmp1.im, np1, mp1, ne, _m, _N, V2, H[l][i].re, H[l][i].im); } } } } /* Copy the content of H to work, work2 and G */ for(i=0; i < mp1; i++) { for(l = 0; l < mp1; l++) { G[i][l] = H[i][l]; work2[i][l] = H[i][l]; work[l][i].re = H[i][l].re; work[l][i].im = -H[i][l].im; } } for(j = ne; j < m; j++) { /* solver_field[0]=A*v_j */ f(solver_field[1], V[j]); /* Set h_ij and omega_j */ /* solver_field[1] <- omega_j */ /* assign(solver_field[1], solver_field[0], N); */ for(i = 0; i <= j; i++){ H[j][i] = scalar_prod(V[i], solver_field[1], N, 1); /* H, G, work and work2 are now all in Fortran storage: columns first */ G[j][i] = H[j][i]; work2[j][i] = H[j][i]; work[i][j].re = H[j][i].re; work[i][j].im = -H[j][i].im; assign_diff_mul(solver_field[1], V[i], H[j][i], N); } beta2 = square_norm(solver_field[1], N, 1); _complex_set(H[j][j+1], sqrt(beta2), 0.); G[j][j+1] = H[j][j+1]; work2[j][j+1] = H[j][j+1]; work[j+1][j].re = H[j][j+1].re; work[j+1][j].im = -H[j][j+1].im; mul_r(V[(j+1)], 1./H[j][j+1].re, solver_field[1], N); } /* Solve the least square problem for alpha*/ /* This produces c=V_m+1*r0 */ for(i = 0; i < mp1; i++) { c[i] = scalar_prod(V[i], r0, N, 1); alpha[i] = c[i]; if(g_proc_id == 0 && g_debug_level > 3) { printf("c: %e %e err = %e\n", c[i].re, c[i].im, err); } } if(lswork == NULL) { lwork = -1; _FT(zgels) ("N", &mp1, &_m, &one, H[0], &mp1, alpha, &mp1, &tmp1, &lwork, &info, 1); lwork = (int)tmp1.re; lswork = (complex*)malloc(lwork*sizeof(complex)); } _FT(zgels) ("N", &mp1, &_m, &one, H[0], &mp1, alpha, &mp1, lswork, &lwork, &info, 1); if(g_proc_id == 0 && g_debug_level > 3) { printf("zgels returned info = %d\n", info); fflush(stdout); } /* Compute the new solution vector */ for(i = 0; i < m; i++){ if(g_proc_id == 0 && g_debug_level > 3) { printf("alpha: %e %e\n", alpha[i].re, alpha[i].im); } assign_add_mul(x0, V[i], alpha[i], N); } } /* If maximal number of restart is reached */ assign(P, 
x0, N); finalize_solver(solver_field, nr_sf); return(-1); }
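/*
 * Illustration (not part of the build): gmres_dr() (and gmres() further below)
 * reduces the Hessenberg matrix column by column with complex Givens rotations --
 * the beta/s/c updates above.  The sketch below applies those formulas to a pair
 * of hypothetical entries h_jj (complex) and h_{j+1,j} (real, as produced by the
 * Arnoldi normalisation) and checks that the subdiagonal entry is rotated to zero.
 */
#if 0   /* standalone sketch -- compile separately */
#include <complex.h>
#include <math.h>
#include <stdio.h>

int main(void) {
  /* hypothetical Hessenberg entries */
  double complex h_jj  = 0.7 - 0.3*I;
  double         h_j1j = 0.4;

  /* same formulas as above: beta = sqrt(|h_jj|^2 + h_{j+1,j}^2),
     s = h_{j+1,j}/beta (real), c = h_jj/beta (complex) */
  double beta = sqrt(creal(h_jj)*creal(h_jj) + cimag(h_jj)*cimag(h_jj) + h_j1j*h_j1j);
  double s = h_j1j / beta;
  double complex c = h_jj / beta;

  /* applying the rotation to the column eliminates the subdiagonal entry:
     h_jj'    = conj(c)*h_jj + s*h_{j+1,j}  ( = beta ),
     h_j+1,j' = s*h_jj       - c*h_{j+1,j}  ( = 0 )                        */
  double complex new_diag = conj(c)*h_jj + s*h_j1j;
  double complex new_sub  = s*h_jj       - c*h_j1j;

  printf("beta = %g, new diagonal = %g%+gi, new subdiagonal = %g%+gi\n",
         beta, creal(new_diag), cimag(new_diag), creal(new_sub), cimag(new_sub));
  /* the same rotation is then applied to the right-hand side:
     alpha[j+1] = s*alpha[j], alpha[j] = conj(c)*alpha[j]                  */
  return 0;
}
#endif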
int bicgstabell(spinor * const x0, spinor * const b, const int max_iter, double eps_sq, const int rel_prec, const int _l, const int N, matrix_mult f) { double err; int i, j, k, l; double rho0, rho1, beta, alpha, omega, gamma0 = 0., squarenorm; spinor * r[5], * u[5], * r0_tilde, * x; double tau[5][5], gamma[25], gammap[25], gammapp[25], sigma[25]; spinor ** solver_field = NULL; const int nr_sf = 2*(_l+1)+2; l = _l; k = -l; if(N == VOLUME) { init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); } else { init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); } r0_tilde = solver_field[0]; for(i = 0; i <= l; i++){ r[i] = solver_field[2+2*i]; u[i] = solver_field[3+2*i]; } x = x0; assign(u[0], b, N); f(r0_tilde, x); diff(r[0], u[0], r0_tilde, N); zero_spinor_field(solver_field[1], N); assign(r0_tilde, r[0], N); squarenorm = square_norm(b, N, 1); rho0 = 1.; alpha = 0.; omega = 1.; err = square_norm(r0_tilde, N, 1); while( k < max_iter && (((err > eps_sq) && (rel_prec == 0)) || ((err > eps_sq*squarenorm) && (rel_prec == 1)) )) { k+=l; /* The BiCG part */ rho0 *= -omega; for(j = 0; j < l; j++) { rho1 = scalar_prod_r(r[j], r0_tilde, N, 1); beta = (rho1/rho0); beta *= alpha; rho0 = rho1; for(i = 0; i <= j; i++) { /* u_i = r_i - \beta u_i */ assign_mul_add_r(u[i], -beta, r[i], N); } f(u[j+1], u[j]); gamma0 = scalar_prod_r(u[j+1], r0_tilde, N, 1); alpha = rho0/gamma0; /* r_i = r_i - \alpha u_{i+1} */ for(i = 0; i <= j; i++) { assign_add_mul_r(r[i], u[i+1], -alpha, N); } f(r[j+1], r[j]); /* x = x + \alpha u_0 */ assign_add_mul_r(x, u[0], alpha, N); err = square_norm(r[j+1], N, 1); if(g_proc_id == 0 && g_debug_level > 1) {printf("%d %d err = %e\n", k, j, err);fflush(stdout);} } /* The MR part */ for(j = 1; j <= l; j++){ for(i = 1; i < j; i++){ tau[i][j] = scalar_prod_r(r[j], r[i], N, 1)/sigma[i]; assign_add_mul_r(r[j], r[i], -tau[i][j], N); } sigma[j] = scalar_prod_r(r[j], r[j], N, 1); gammap[j] = scalar_prod_r(r[0], r[j], N, 1)/sigma[j]; } gamma[l] = gammap[l]; omega = gamma[l]; for(j = l-1; j > 0; j--) { gamma[j] = gammap[j]; for(i = j+1; i <= l; i++) { gamma[j] -= (tau[j][i]*gamma[i]); } } for(j = 1; j < l; j++) { gammapp[j] = gamma[j+1]; for(i = j+1; i < l; i++){ gammapp[j] += (tau[j][i]*gamma[i+1]); } } assign_add_mul_r(x, r[0], gamma[1], N); assign_add_mul_r(r[0], r[l], -gammap[l], N); for(j = 1; j < l; j++){ assign_add_mul_r(x, r[j], gammapp[j], N); assign_add_mul_r(r[0], r[j], -gammap[j], N); } assign_add_mul_r(u[0], u[l], -gamma[l], N); for(j = 1; j < l; j++){ assign_add_mul_r(u[0], u[j], -gamma[j], N); } err = square_norm(r[0], N, 1); if(g_proc_id == 0 && g_debug_level > 0){ printf(" BiCGstabell iterated %d %d, %e rho0 = %e, alpha = %e, gamma0= %e\n", l, k, err, rho0, alpha, gamma0); fflush( stdout ); } } finalize_solver(solver_field, nr_sf); if(k == max_iter) return(-1); return(k); }
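/*
 * Illustration (not part of the build): bicgstabell() is BiCGstab(l), alternating
 * l BiCG steps with an l-dimensional MR polynomial update; for l = 1 it reduces to
 * ordinary BiCGstab.  The sketch below is plain BiCGstab on a hypothetical 3x3
 * nonsymmetric matrix, showing the rho/alpha/omega recurrence that the BiCG part
 * above generalises.
 */
#if 0   /* standalone sketch -- compile separately */
#include <math.h>
#include <stdio.h>

#define N 3

/* hypothetical nonsymmetric stand-in for the operator f */
static const double A[N][N] = {
  {3.0, 1.0, 0.0},
  {0.5, 2.5, 0.7},
  {0.0, 1.0, 2.0}
};

static void matvec(double *w, const double *v) {
  for (int i = 0; i < N; i++) {
    w[i] = 0.;
    for (int j = 0; j < N; j++) w[i] += A[i][j] * v[j];
  }
}

static double dot(const double *a, const double *b) {
  double s = 0.;
  for (int i = 0; i < N; i++) s += a[i] * b[i];
  return s;
}

int main(void) {
  double b[N] = {1, 0, 2}, x[N] = {0};
  double r[N], r0t[N], p[N] = {0}, v[N] = {0}, s[N], t[N];
  double rho = 1., rho_old = 1., alpha = 1., omega = 1.;

  for (int i = 0; i < N; i++) { r[i] = b[i]; r0t[i] = b[i]; } /* x = 0: r = r0~ = b */

  for (int k = 0; k < 50 && dot(r, r) > 1.e-24; k++) {
    rho = dot(r0t, r);
    double beta = (rho / rho_old) * (alpha / omega);
    for (int i = 0; i < N; i++) p[i] = r[i] + beta * (p[i] - omega * v[i]);
    matvec(v, p);
    alpha = rho / dot(r0t, v);
    for (int i = 0; i < N; i++) s[i] = r[i] - alpha * v[i];
    matvec(t, s);
    omega = dot(t, s) / dot(t, t);
    for (int i = 0; i < N; i++) x[i] += alpha * p[i] + omega * s[i];
    for (int i = 0; i < N; i++) r[i] = s[i] - omega * t[i];
    rho_old = rho;
    printf("bicgstab sketch: k = %d |r|^2 = %e\n", k, dot(r, r));
  }
  return 0;
}
#endif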
void Msap_eo_old(spinor * const P, spinor * const Q, const int Ncy, const int Niter) { int blk, ncy = 0, eo, vol, vols; spinor * r, * a, * b, * c; double nrm; double musave = g_mu; double kappasave = g_kappa; spinor * b_even, * b_odd, * a_even, * a_odd; spinor ** solver_field = NULL; // also get space for mrblk! 6 = 3+3 const int nr_sf = 6; if(kappa_dflgen > 0) { g_kappa = kappa_dfl; } if(mu_dflgen > -10) { g_mu = mu_dfl; // make sure the sign is correct! if(g_mu*musave < 0) g_mu *= -1.; } boundary(g_kappa); /* * here it would be probably better to get the working fields as a parameter * from the calling function */ vols = block_list[0].volume/2+block_list[0].spinpad; vol = block_list[0].volume/2; init_solver_field(&solver_field, nb_blocks*2*vols, nr_sf); r = solver_field[0]; a = solver_field[1]; b = solver_field[2]; for(ncy = 0; ncy < Ncy; ncy++) { /* compute the global residue */ /* this can be done more efficiently */ /* here only a naive implementation */ for(eo = 0; eo < 2; eo++) { D_psi(r, P); diff(r, Q, r, VOLUME); nrm = square_norm(r, VOLUME, 1); if(g_proc_id == 0 && g_debug_level > 2 && eo == 0) { printf("Msap_eo: %d %1.3e mu = %e\n", ncy, nrm, g_mu/2./g_kappa); fflush(stdout); } /* choose the even (odd) block */ // rely on nested parallelism // #ifdef TM_USE_OMP # pragma omp parallel for private (a_even, a_odd, b_even, b_odd, c) #endif for (blk = 0; blk < nb_blocks; blk++) { b_even = b + blk*2*vols; b_odd = b +blk*2*vols + vols; a_even = a + blk*2*vols; a_odd = a + blk*2*vols + vols; c = solver_field[3] + blk*vols; if(block_list[blk].evenodd == eo) { /* get part of r corresponding to block blk into b_even and b_odd */ copy_global_to_block_eo(b_even, b_odd, r, blk); if(g_c_sw > 0) { assign_mul_one_sw_pm_imu_inv_block(EE, a_even, b_even, g_mu, &block_list[blk]); Block_H_psi(&block_list[blk], a_odd, a_even, OE); /* a_odd = b_odd - a_odd */ diff(a_odd, b_odd, a_odd, vol); mrblk(b_odd, a_odd, solver_field[3] + blk*2*3*vols, Niter, 1.e-31, 1, vol, &Msw_plus_block_psi, blk); Block_H_psi(&block_list[blk], b_even, b_odd, EO); assign(c, b_even, vol); assign_mul_one_sw_pm_imu_inv_block(EE, b_even, c, g_mu, &block_list[blk]); } else { assign_mul_one_pm_imu_inv(a_even, b_even, +1., vol); Block_H_psi(&block_list[blk], a_odd, a_even, OE); /* a_odd = b_odd - a_odd */ diff(a_odd, b_odd, a_odd, vol); mrblk(b_odd, a_odd, solver_field[3] + blk*2*3*vols, Niter, 1.e-31, 1, vol, &Mtm_plus_block_psi, blk); Block_H_psi(&block_list[blk], b_even, b_odd, EO); mul_one_pm_imu_inv(b_even, +1., vol); } /* a_even = a_even - b_even */ diff(a_even, a_even, b_even, vol); /* add even and odd part up to full spinor P */ add_eo_block_to_global(P, a_even, b_odd, blk); } } } } finalize_solver(solver_field, nr_sf); g_mu = musave; g_kappa = kappasave; boundary(g_kappa); return; }
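/*
 * Illustration (not part of the build): inside each block, Msap_eo_old() eliminates
 * the even sites with D_ee^{-1}, lets mrblk() solve the resulting odd-odd system
 * (in essence the Schur complement), and then reconstructs the even sites.  The
 * sketch below performs the same elimination and reconstruction with scalar
 * stand-ins for the block operator and the block source.
 */
#if 0   /* standalone sketch -- compile separately */
#include <stdio.h>

/* scalar stand-ins for the block operator
     D = [ D_ee  D_eo ]
         [ D_oe  D_oo ]
   and a block source (b_e, b_o); all values are hypothetical */
int main(void) {
  const double dee = 2.0, deo = 0.5, doe = 0.3, doo = 1.5;
  const double be  = 1.0, bo  = 2.0;

  /* 1) a_e = D_ee^{-1} b_e                       (assign_mul_one_pm_imu_inv)      */
  double ae = be / dee;
  /* 2) a_o = b_o - D_oe a_e                      (Block_H_psi OE, then diff)      */
  double ao = bo - doe * ae;
  /* 3) solve the odd-odd Schur complement
        (D_oo - D_oe D_ee^{-1} D_eo) x_o = a_o    (done by mrblk above)            */
  double xo = ao / (doo - doe * deo / dee);
  /* 4) reconstruct the even sites
        x_e = D_ee^{-1} (b_e - D_eo x_o)          (Block_H_psi EO, inverse, diff)  */
  double xe = (be - deo * xo) / dee;

  /* check: applying the full block operator should reproduce the source */
  printf("residual even: %e  odd: %e\n",
         be - (dee * xe + deo * xo), bo - (doe * xe + doo * xo));
  return 0;
}
#endif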
/* P output = solution , Q input = source */ int cg_mms_tm(spinor * const P, spinor * const Q, const int max_iter, double eps_sq, const int rel_prec, const int N, matrix_mult f) { static double normsq, pro, err, alpha_cg = 1., beta_cg = 0., squarenorm; int iteration, im, append = 0; char filename[100]; static double gamma, alpham1; int const cg_mms_default_precision = 32; double tmp_mu = g_mu; WRITER * writer = NULL; paramsInverterInfo *inverterInfo = NULL; paramsPropagatorFormat *propagatorFormat = NULL; spinor * temp_save; //used to save all the masses spinor ** solver_field = NULL; const int nr_sf = 5; init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); init_mms_tm(g_no_extra_masses); /* currently only implemented for P=0 */ zero_spinor_field(P, N); /* Value of the bare MMS-masses (\mu^2 - \mu_0^2) */ for(im = 0; im < g_no_extra_masses; im++) { sigma[im] = g_extra_masses[im]*g_extra_masses[im] - g_mu*g_mu; assign(xs_mms_solver[im], P, N); assign(ps_mms_solver[im], Q, N); zitam1[im] = 1.0; zita[im] = 1.0; alphas[im] = 1.0; betas[im] = 0.0; } squarenorm = square_norm(Q, N, 1); assign(solver_field[0], P, N); /* normsp = square_norm(P, N, 1); */ /* initialize residue r and search vector p */ /* if(normsp == 0){ */ /* currently only implemented for P=0 */ if(1) { /* if a starting solution vector equal to zero is chosen */ assign(solver_field[1], Q, N); assign(solver_field[2], Q, N); normsq = square_norm(Q, N, 1); } else{ /* if a starting solution vector different from zero is chosen */ f(solver_field[3], solver_field[0]); diff(solver_field[1], Q, solver_field[3], N); assign(solver_field[2], solver_field[1], N); normsq = square_norm(solver_field[2], N, 1); } /* main loop */ for(iteration = 0; iteration < max_iter; iteration++) { /* Q^2*p and then (p,Q^2*p) */ f(solver_field[4], solver_field[2]); pro = scalar_prod_r(solver_field[2], solver_field[4], N, 1); /* For the update of the coeff. of the shifted pol. we need alpha_cg(i-1) and alpha_cg(i). 
This is the reason why we need this double definition of alpha */ alpham1 = alpha_cg; /* Compute alpha_cg(i+1) */ alpha_cg = normsq/pro; for(im = 0; im < g_no_extra_masses; im++) { /* Now gamma is a temp variable that corresponds to zita(i+1) */ gamma = zita[im]*alpham1/(alpha_cg*beta_cg*(1.-zita[im]/zitam1[im]) + alpham1*(1.+sigma[im]*alpha_cg)); /* Now zita(i-1) is put equal to the old zita(i) */ zitam1[im] = zita[im]; /* Now zita(i+1) is updated */ zita[im] = gamma; /* Update of alphas(i) = alpha_cg(i)*zita(i+1)/zita(i) */ alphas[im] = alpha_cg*zita[im]/zitam1[im]; /* Compute xs(i+1) = xs(i) + alphas(i)*ps(i) */ assign_add_mul_r(xs_mms_solver[im], ps_mms_solver[im], alphas[im], N); } /* Compute x_(i+1) = x_i + alpha_cg(i+1) p_i */ assign_add_mul_r(solver_field[0], solver_field[2], alpha_cg, N); /* Compute r_(i+1) = r_i - alpha_cg(i+1) Qp_i */ assign_add_mul_r(solver_field[1], solver_field[4], -alpha_cg, N); /* Check whether the precision eps_sq is reached */ err = square_norm(solver_field[1], N, 1); if(g_debug_level > 2 && g_proc_id == g_stdio_proc) { printf("CGMMS iteration: %d residue: %g\n", iteration, err); fflush( stdout ); } if( ((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1)) ) { assign(P, solver_field[0], N); f(solver_field[2], P); diff(solver_field[3], solver_field[2], Q, N); err = square_norm(solver_field[3], N, 1); if(g_debug_level > 0 && g_proc_id == g_stdio_proc) { printf("# CG MMS true residue at final iteration (%d) was %g.\n", iteration, err); fflush( stdout); } g_sloppy_precision = 0; g_mu = tmp_mu; /* save all the results of (Q^dagger Q)^(-1) \gamma_5 \phi */ /* here ... */ /* when im == -1 save the base mass*/ for(im = -1; im < g_no_extra_masses; im++) { if(im==-1) { temp_save=solver_field[0]; } else { temp_save=xs_mms_solver[im]; } if(SourceInfo.type != 1) { if (PropInfo.splitted) { sprintf(filename, "%s.%.4d.%.2d.%.2d.cgmms.%.2d.inverted", SourceInfo.basename, SourceInfo.nstore, SourceInfo.t, SourceInfo.ix, im+1); } else { sprintf(filename, "%s.%.4d.%.2d.cgmms.%.2d.inverted", SourceInfo.basename, SourceInfo.nstore, SourceInfo.t, im+1); } } else { sprintf(filename, "%s.%.4d.%.5d.cgmms.%.2d.0", SourceInfo.basename, SourceInfo.nstore, SourceInfo.sample, im+1); } if(g_kappa != 0) { mul_r(temp_save, (2*g_kappa)*(2*g_kappa), temp_save, N); } append = !PropInfo.splitted; construct_writer(&writer, filename, append); if (PropInfo.splitted || SourceInfo.ix == index_start) { //Create the inverter info NOTE: always set to TWILSON=12 and 1 flavour (to be adjusted) inverterInfo = construct_paramsInverterInfo(err, iteration+1, 12, 1); if (im == -1) { inverterInfo->cgmms_mass = inverterInfo->mu; } else { inverterInfo->cgmms_mass = g_extra_masses[im]/(2 * inverterInfo->kappa); } write_spinor_info(writer, PropInfo.format, inverterInfo, append); //Create the propagatorFormat NOTE: always set to 1 flavour (to be adjusted) propagatorFormat = construct_paramsPropagatorFormat(cg_mms_default_precision, 1); write_propagator_format(writer, propagatorFormat); free(inverterInfo); free(propagatorFormat); } convert_lexic_to_eo(solver_field[2], solver_field[1], temp_save); write_spinor(writer, &solver_field[2], &solver_field[1], 1, 32); destruct_writer(writer); } finalize_solver(solver_field, nr_sf); return(iteration+1); } /* Compute beta_cg(i+1) = (r(i+1),r(i+1))/(r(i),r(i)) Compute p(i+1) = r(i+1) + beta(i+1)*p(i) */ beta_cg = err/normsq; assign_mul_add_r(solver_field[2], beta_cg, solver_field[1], N); normsq = err; /* Compute betas(i+1) = 
beta_cg(i)*(zita(i+1)*alphas(i))/(zita(i)*alpha_cg(i)) Compute ps(i+1) = zita(i+1)*r(i+1) + betas(i+1)*ps(i) */ for(im = 0; im < g_no_extra_masses; im++) { betas[im] = beta_cg*zita[im]*alphas[im]/(zitam1[im]*alpha_cg); assign_mul_add_mul_r(ps_mms_solver[im], solver_field[1], betas[im], zita[im], N); } } assign(P, solver_field[0], N); g_sloppy_precision = 0; finalize_solver(solver_field, nr_sf); return(-1); }
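/*
 * Illustration (not part of the build): cg_mms_tm() runs a single CG on the base
 * mass and propagates the shifted solutions through the zita/alphas/betas
 * recurrence.  The sketch below repeats that update order for a hypothetical 3x3
 * symmetric positive definite matrix and two positive shifts, and checks the
 * shifted residuals (A + sigma) x_s - b at the end.
 */
#if 0   /* standalone sketch -- compile separately */
#include <math.h>
#include <stdio.h>

#define N   3
#define NS  2   /* number of extra shifts */

static const double A[N][N]   = { {4,1,0}, {1,3,1}, {0,1,2} }; /* hypothetical SPD matrix */
static const double sigma[NS] = { 0.5, 2.0 };                  /* hypothetical shifts     */

static void matvec(double *w, const double *v) {
  for (int i = 0; i < N; i++) {
    w[i] = 0.;
    for (int j = 0; j < N; j++) w[i] += A[i][j] * v[j];
  }
}
static double dot(const double *a, const double *b) {
  double s = 0.;
  for (int i = 0; i < N; i++) s += a[i] * b[i];
  return s;
}

int main(void) {
  double b[N] = {1, 2, 3}, x[N] = {0}, r[N], p[N], Ap[N];
  double xs[NS][N] = {{0}}, ps[NS][N];
  double zita[NS], zitam1[NS], alphas[NS], betas[NS];
  double alpha = 1., beta = 0., alpham1, normsq;

  for (int i = 0; i < N; i++) { r[i] = b[i]; p[i] = b[i]; }
  for (int s = 0; s < NS; s++) {
    for (int i = 0; i < N; i++) ps[s][i] = b[i];
    zitam1[s] = zita[s] = 1.;  alphas[s] = 1.;  betas[s] = 0.;
  }
  normsq = dot(r, r);

  for (int it = 0; it < 100 && normsq > 1.e-24; it++) {
    matvec(Ap, p);
    alpham1 = alpha;
    alpha   = normsq / dot(p, Ap);
    for (int s = 0; s < NS; s++) {                 /* shifted coefficient update */
      double gamma = zita[s]*alpham1 /
        (alpha*beta*(1. - zita[s]/zitam1[s]) + alpham1*(1. + sigma[s]*alpha));
      zitam1[s] = zita[s];  zita[s] = gamma;
      alphas[s] = alpha*zita[s]/zitam1[s];
      for (int i = 0; i < N; i++) xs[s][i] += alphas[s]*ps[s][i];
    }
    for (int i = 0; i < N; i++) x[i] += alpha*p[i];
    for (int i = 0; i < N; i++) r[i] -= alpha*Ap[i];
    double err = dot(r, r);
    beta = err / normsq;
    for (int i = 0; i < N; i++) p[i] = r[i] + beta*p[i];
    for (int s = 0; s < NS; s++) {                 /* shifted search vectors */
      betas[s] = beta*zita[s]*alphas[s]/(zitam1[s]*alpha);
      for (int i = 0; i < N; i++) ps[s][i] = zita[s]*r[i] + betas[s]*ps[s][i];
    }
    normsq = err;
  }

  /* check: (A + sigma_s) xs_s should reproduce b for every shift */
  for (int s = 0; s < NS; s++) {
    matvec(Ap, xs[s]);
    double res = 0.;
    for (int i = 0; i < N; i++) {
      double d = b[i] - (Ap[i] + sigma[s]*xs[s][i]);
      res += d*d;
    }
    printf("shift %g: |(A+sigma)x - b|^2 = %e\n", sigma[s], res);
  }
  return 0;
}
#endif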
int gmres(spinor * const P,spinor * const Q, const int m, const int max_restarts, const double eps_sq, const int rel_prec, const int N, const int parallel, matrix_mult f){ int restart, i, j, k; double beta, eps, norm; complex tmp1, tmp2; spinor ** solver_field = NULL; const int nr_sf = 3; if(N == VOLUME) { init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); } else { init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); } eps=sqrt(eps_sq); init_gmres(m, VOLUMEPLUSRAND); norm = sqrt(square_norm(Q, N, parallel)); assign(solver_field[2], P, N); for(restart = 0; restart < max_restarts; restart++){ /* r_0=Q-AP (b=Q, x+0=P) */ f(solver_field[0], solver_field[2]); diff(solver_field[0], Q, solver_field[0], N); /* v_0=r_0/||r_0|| */ alpha[0].re=sqrt(square_norm(solver_field[0], N, parallel)); if(g_proc_id == g_stdio_proc && g_debug_level > 1){ printf("%d\t%g true residue\n", restart*m, alpha[0].re*alpha[0].re); fflush(stdout); } if(alpha[0].re==0.){ assign(P, solver_field[2], N); finalize_solver(solver_field, nr_sf); return(restart*m); } mul_r(V[0], 1./alpha[0].re, solver_field[0], N); for(j = 0; j < m; j++){ /* solver_field[0]=A*v_j */ f(solver_field[0], V[j]); /* Set h_ij and omega_j */ /* solver_field[1] <- omega_j */ assign(solver_field[1], solver_field[0], N); for(i = 0; i <= j; i++){ H[i][j] = scalar_prod(V[i], solver_field[1], N, parallel); assign_diff_mul(solver_field[1], V[i], H[i][j], N); } _complex_set(H[j+1][j], sqrt(square_norm(solver_field[1], N, parallel)), 0.); for(i = 0; i < j; i++){ tmp1 = H[i][j]; tmp2 = H[i+1][j]; _mult_real(H[i][j], tmp2, s[i]); _add_assign_complex_conj(H[i][j], c[i], tmp1); _mult_real(H[i+1][j], tmp1, s[i]); _diff_assign_complex(H[i+1][j], c[i], tmp2); } /* Set beta, s, c, alpha[j],[j+1] */ beta = sqrt(_complex_square_norm(H[j][j]) + _complex_square_norm(H[j+1][j])); s[j] = H[j+1][j].re / beta; _mult_real(c[j], H[j][j], 1./beta); _complex_set(H[j][j], beta, 0.); _mult_real(alpha[j+1], alpha[j], s[j]); tmp1 = alpha[j]; _mult_assign_complex_conj(alpha[j], c[j], tmp1); /* precision reached? */ if(g_proc_id == g_stdio_proc && g_debug_level > 1){ printf("%d\t%g residue\n", restart*m+j, alpha[j+1].re*alpha[j+1].re); fflush(stdout); } if(((alpha[j+1].re <= eps) && (rel_prec == 0)) || ((alpha[j+1].re <= eps*norm) && (rel_prec == 1))){ _mult_real(alpha[j], alpha[j], 1./H[j][j].re); assign_add_mul(solver_field[2], V[j], alpha[j], N); for(i = j-1; i >= 0; i--){ for(k = i+1; k <= j; k++){ _mult_assign_complex(tmp1, H[i][k], alpha[k]); _diff_complex(alpha[i], tmp1); } _mult_real(alpha[i], alpha[i], 1./H[i][i].re); assign_add_mul(solver_field[2], V[i], alpha[i], N); } for(i = 0; i < m; i++){ alpha[i].im = 0.; } assign(P, solver_field[2], N); finalize_solver(solver_field, nr_sf); return(restart*m+j); } /* if not */ else{ if(j != m-1){ mul_r(V[(j+1)], 1./H[j+1][j].re, solver_field[1], N); } } } j=m-1; /* prepare for restart */ _mult_real(alpha[j], alpha[j], 1./H[j][j].re); assign_add_mul(solver_field[2], V[j], alpha[j], N); for(i = j-1; i >= 0; i--){ for(k = i+1; k <= j; k++){ _mult_assign_complex(tmp1, H[i][k], alpha[k]); _diff_complex(alpha[i], tmp1); } _mult_real(alpha[i], alpha[i], 1./H[i][i].re); assign_add_mul(solver_field[2], V[i], alpha[i], N); } for(i = 0; i < m; i++){ alpha[i].im = 0.; } } /* If maximal number of restarts is reached */ assign(P, solver_field[2], N); finalize_solver(solver_field, nr_sf); return(-1); }
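/*
 * Illustration (not part of the build): once the rotated residual alpha[j+1] is
 * small enough, gmres() (and gmres_dr() above) recovers the coefficients by back
 * substitution in the rotated, upper-triangular Hessenberg matrix, whose diagonal
 * is real after the Givens rotations.  The sketch below does that back substitution
 * for a hypothetical 3x3 triangular system in complex arithmetic and checks the
 * first row of the result.
 */
#if 0   /* standalone sketch -- compile separately */
#include <complex.h>
#include <stdio.h>

#define M 3

int main(void) {
  /* hypothetical rotated Hessenberg matrix: upper triangular with a real,
     positive diagonal (the beta values produced by the Givens rotations) */
  double complex R[M][M] = {
    { 2.0, 0.3 + 0.1*I, 0.2 - 0.4*I },
    { 0.0, 1.5,         0.1 + 0.2*I },
    { 0.0, 0.0,         1.2         }
  };
  /* rotated right-hand side (the alpha[0..j] of the code above) */
  double complex alpha[M] = { 1.0 + 0.5*I, 0.4 - 0.2*I, 0.3 + 0.1*I };

  /* back substitution in the same order as the loops above:
     start from the last row, then eliminate upwards */
  int j = M - 1;
  alpha[j] /= creal(R[j][j]);
  for (int i = j - 1; i >= 0; i--) {
    for (int k = i + 1; k <= j; k++)
      alpha[i] -= R[i][k] * alpha[k];
    alpha[i] /= creal(R[i][i]);
  }

  /* check: R * alpha should reproduce the original right-hand side */
  double complex rhs0 = R[0][0]*alpha[0] + R[0][1]*alpha[1] + R[0][2]*alpha[2];
  printf("row 0 check: %g%+gi (expected 1+0.5i)\n", creal(rhs0), cimag(rhs0));
  /* in gmres() each coefficient alpha[i] then multiplies the basis vector V[i]
     and is accumulated into the solution with assign_add_mul() */
  return 0;
}
#endif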