// Apply the even-odd preconditioned Dirac operator void tm_matpc(void *outEven, void **gauge, void *inEven, double kappa, double mu, QudaTwistFlavorType flavor, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param) { void *tmp = malloc(Vh*spinorSiteSize*precision); if (matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) { wil_dslash(tmp, gauge, inEven, 1, daggerBit, precision, gauge_param); twist_gamma5(tmp, tmp, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); wil_dslash(outEven, gauge, tmp, 0, daggerBit, precision, gauge_param); twist_gamma5(tmp, inEven, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision); } else if (matpc_type == QUDA_MATPC_ODD_ODD_ASYMMETRIC) { wil_dslash(tmp, gauge, inEven, 0, daggerBit, precision, gauge_param); twist_gamma5(tmp, tmp, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); wil_dslash(outEven, gauge, tmp, 1, daggerBit, precision, gauge_param); twist_gamma5(tmp, inEven, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision); } else if (!daggerBit) { if (matpc_type == QUDA_MATPC_EVEN_EVEN) { wil_dslash(tmp, gauge, inEven, 1, daggerBit, precision, gauge_param); twist_gamma5(tmp, tmp, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); wil_dslash(outEven, gauge, tmp, 0, daggerBit, precision, gauge_param); twist_gamma5(outEven, outEven, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); } else if (matpc_type == QUDA_MATPC_ODD_ODD) { wil_dslash(tmp, gauge, inEven, 0, daggerBit, precision, gauge_param); twist_gamma5(tmp, tmp, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); wil_dslash(outEven, gauge, tmp, 1, daggerBit, precision, gauge_param); twist_gamma5(outEven, outEven, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); } } else { if (matpc_type == QUDA_MATPC_EVEN_EVEN) { twist_gamma5(inEven, inEven, daggerBit, kappa, mu, flavor, 
Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); wil_dslash(tmp, gauge, inEven, 1, daggerBit, precision, gauge_param); twist_gamma5(tmp, tmp, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); wil_dslash(outEven, gauge, tmp, 0, daggerBit, precision, gauge_param); twist_gamma5(inEven, inEven, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision); } else if (matpc_type == QUDA_MATPC_ODD_ODD) { twist_gamma5(inEven, inEven, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); wil_dslash(tmp, gauge, inEven, 0, daggerBit, precision, gauge_param); twist_gamma5(tmp, tmp, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_INVERSE, precision); wil_dslash(outEven, gauge, tmp, 1, daggerBit, precision, gauge_param); twist_gamma5(inEven, inEven, daggerBit, kappa, mu, flavor, Vh, QUDA_TWIST_GAMMA5_DIRECT, precision); // undo } } // lastly apply the kappa term double kappa2 = -kappa*kappa; if (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_ODD_ODD) { if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)inEven, kappa2, (double*)outEven, Vh*spinorSiteSize); else xpay((float*)inEven, (float)kappa2, (float*)outEven, Vh*spinorSiteSize); } else { if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp, kappa2, (double*)outEven, Vh*spinorSiteSize); else xpay((float*)tmp, (float)kappa2, (float*)outEven, Vh*spinorSiteSize); } free(tmp); }
void wil_mat(void *out, void **gauge, void *in, double kappa, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param) { void *inEven = in; void *inOdd = (char*)in + Vh*spinorSiteSize*precision; void *outEven = out; void *outOdd = (char*)out + Vh*spinorSiteSize*precision; wil_dslash(outOdd, gauge, inEven, 1, dagger_bit, precision, gauge_param); wil_dslash(outEven, gauge, inOdd, 0, dagger_bit, precision, gauge_param); // lastly apply the kappa term if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)in, -kappa, (double*)out, V*spinorSiteSize); else xpay((float*)in, -(float)kappa, (float*)out, V*spinorSiteSize); }
/*
 * Solves lapl(u) x = b, for x, given b, using Conjugate Gradient
 *
 *   lp  lattice parameters; lp.L is the linear extent used for the
 *       performance figures
 *   x   on entry the initial guess, on exit the approximate solution
 *   b   right-hand side
 *   g   link field handed to lapl()
 *
 * Runs at most 100 iterations and stops early once sqrt(rr/bb) drops below
 * 1e-9, where rr and bb are the values xdotx() returns for the residual and
 * for b.  Progress, the final residual and the time spent in lapl() are
 * printed to stdout.
 *
 * The helper semantics are assumed from their use here (TODO confirm):
 * xmy(b, r) turns r into b - r, axpy(a, x, y) does y += a*x and
 * xpay(x, a, y) does y = x + a*y.
 */
void
cg(latparams lp, field **x, field **b, link **g)
{
  size_t L = lp.L;
  int max_iter = 100;
  float tol = 1e-9;

  /* Temporary fields needed for CG */
  field **r = new_field(lp);
  field **p = new_field(lp);
  field **Ap = new_field(lp);

  /* Initial residual and p-vector */
  lapl(lp, r, x, g);
  xmy(lp, b, r);
  xeqy(lp, p, r);

  /* Initial r-norm and b-norm */
  float rr = xdotx(lp, r);
  float bb = xdotx(lp, b);
  double t_lapl = 0;
  int iter = 0;
  for(iter=0; iter<max_iter; iter++) {
    printf(" %6d, res = %+e\n", iter, rr/bb);
    if(sqrt(rr/bb) < tol)
      break;
    /* Only the operator application is timed */
    double t = stop_watch(0);
    lapl(lp, Ap, p, g);
    t_lapl += stop_watch(t);
    float pAp = xdoty(lp, p, Ap);
    float alpha = rr/pAp;
    axpy(lp, alpha, p, x);
    axpy(lp, -alpha, Ap, r);
    float r1r1 = xdotx(lp, r);
    float beta = r1r1/rr;
    xpay(lp, r, beta, p);
    rr = r1r1;
  }

  /* Recompute residual after convergence */
  lapl(lp, r, x, g);
  xmy(lp, b, r);
  rr = xdotx(lp, r);

  /*
   * Performance figures.  `iter` equals the number of timed lapl() calls.
   * When the initial guess already satisfies the tolerance no call was
   * timed, so report zeros instead of dividing by zero.
   */
  double t_per_call = 0;
  double beta_fp = 0;
  double beta_io = 0;
  if(iter > 0)
    t_per_call = t_lapl/(double)iter;
  if(t_per_call > 0) {
    beta_fp = 50*((double)L*L*L)/t_per_call*1e-9;
    beta_io = 40*((double)L*L*L)/t_per_call*1e-9;
  }
  printf(" Converged after %6d iterations, res = %+e\n", iter, rr/bb);
  printf(" Time in lapl(): %+6.3e sec/call, %4.2e Gflop/s, %4.2e GB/s\n",
         t_per_call, beta_fp, beta_io);

  del_field(r);
  del_field(p);
  del_field(Ap);
  return;
}
// Apply the even-odd preconditioned Dirac operator void MatPCDag(float *outEven, float **gauge, float *inEven, float kappa) { float *tmpOdd = (float*)malloc(Nh*spinorSiteSize*sizeof(float)); // full dslash operator dslashReference(tmpOdd, gauge, inEven, 1, 1); dslashReference(outEven, gauge, tmpOdd, 0, 1); float kappa2 = -kappa*kappa; xpay(inEven, kappa2, outEven, Nh*spinorSiteSize); free(tmpOdd); }
/*
 * Solves lapl(u) x = b, for x, given b, using Conjugate Gradient
 *
 *   L   linear lattice extent, passed to every helper and used for the
 *       performance figures
 *   x   on entry the initial guess, on exit the approximate solution
 *   b   right-hand side
 *   u   field handed to lapl()
 *
 * Runs at most 100 iterations and stops early once sqrt(rr/bb) drops below
 * 1e-6, where rr and bb are the values xdotx() returns for the residual and
 * for b.  Progress, the final residual and the time spent in lapl() are
 * printed to stdout.
 *
 * The helper semantics are assumed from their use here (TODO confirm):
 * xmy(b, r) turns r into b - r, axpy(a, x, y) does y += a*x and
 * xpay(x, a, y) does y = x + a*y.
 */
void
cg(size_t L, _Complex float *x, _Complex float *b, _Complex float *u)
{
  int max_iter = 100;
  float tol = 1e-6;

  /* Temporary fields needed for CG */
  _Complex float *r = new_field(L);
  _Complex float *p = new_field(L);
  _Complex float *Ap = new_field(L);

  /* Initial residual and p-vector */
  lapl(L, r, x, u);
  xmy(L, b, r);
  xeqy(L, p, r);

  /* Initial r-norm and b-norm */
  float rr = xdotx(L, r);
  float bb = xdotx(L, b);
  double t_lapl = 0;
  int iter = 0;
  for(iter=0; iter<max_iter; iter++) {
    printf(" %6d, res = %+e\n", iter, rr/bb);
    if(sqrt(rr/bb) < tol)
      break;
    /* Only the operator application is timed */
    double t = stop_watch(0);
    lapl(L, Ap, p, u);
    t_lapl += stop_watch(t);
    float pAp = xdoty(L, p, Ap);
    float alpha = rr/pAp;
    axpy(L, alpha, p, x);
    axpy(L, -alpha, Ap, r);
    float r1r1 = xdotx(L, r);
    float beta = r1r1/rr;
    xpay(L, r, beta, p);
    rr = r1r1;
  }

  /* Recompute residual after convergence */
  lapl(L, r, x, u);
  xmy(L, b, r);
  rr = xdotx(L, r);

  /*
   * Performance figures.  `iter` equals the number of timed lapl() calls.
   * When the initial guess already satisfies the tolerance no call was
   * timed, so report zeros instead of dividing by zero.
   */
  double t_per_call = 0;
  double beta_fp = 0;
  double beta_io = 0;
  if(iter > 0)
    t_per_call = t_lapl/(double)iter;
  if(t_per_call > 0) {
    beta_fp = 34*L*L/t_per_call*1e-9;
    beta_io = 32*L*L/t_per_call*1e-9;
  }
  printf(" Converged after %6d iterations, res = %+e\n", iter, rr/bb);
  printf(" Time in lapl(): %+6.3e sec/call, %4.2e Gflop/s, %4.2e GB/s\n",
         t_per_call, beta_fp, beta_io);

  free(r);
  free(p);
  free(Ap);
  return;
}
void MatDag(float *out, float **gauge, float *in, float kappa) { float *inEven = in; float *inOdd = in + Nh*spinorSiteSize; float *outEven = out; float *outOdd = out + Nh*spinorSiteSize; // full dslash operator dslashReference(outOdd, gauge, inEven, 1, 1); dslashReference(outEven, gauge, inOdd, 0, 1); // lastly apply the kappa term xpay(in, -kappa, out, N*spinorSiteSize); }
void tm_mat(void *out, void **gauge, void *in, double kappa, double mu, QudaTwistFlavorType flavor, int dagger_bit, QudaPrecision precision, QudaGaugeParam &gauge_param) { void *inEven = in; void *inOdd = (char*)in + Vh*spinorSiteSize*precision; void *outEven = out; void *outOdd = (char*)out + Vh*spinorSiteSize*precision; void *tmp = malloc(V*spinorSiteSize*precision); wil_dslash(outOdd, gauge, inEven, 1, dagger_bit, precision, gauge_param); wil_dslash(outEven, gauge, inOdd, 0, dagger_bit, precision, gauge_param); // apply the twist term to the full lattice twist_gamma5(tmp, in, dagger_bit, kappa, mu, flavor, V, QUDA_TWIST_GAMMA5_DIRECT, precision); // combine if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)tmp, -kappa, (double*)out, V*spinorSiteSize); else xpay((float*)tmp, -(float)kappa, (float*)out, V*spinorSiteSize); free(tmp); }
// Apply the even-odd preconditioned Dirac operator void wil_matpc(void *outEven, void **gauge, void *inEven, double kappa, QudaMatPCType matpc_type, int daggerBit, QudaPrecision precision, QudaGaugeParam &gauge_param) { void *tmp = malloc(Vh*spinorSiteSize*precision); // FIXME: remove once reference clover is finished // full dslash operator if (matpc_type == QUDA_MATPC_EVEN_EVEN || matpc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC) { wil_dslash(tmp, gauge, inEven, 1, daggerBit, precision, gauge_param); wil_dslash(outEven, gauge, tmp, 0, daggerBit, precision, gauge_param); } else { wil_dslash(tmp, gauge, inEven, 0, daggerBit, precision, gauge_param); wil_dslash(outEven, gauge, tmp, 1, daggerBit, precision, gauge_param); } // lastly apply the kappa term double kappa2 = -kappa*kappa; if (precision == QUDA_DOUBLE_PRECISION) xpay((double*)inEven, kappa2, (double*)outEven, Vh*spinorSiteSize); else xpay((float*)inEven, (float)kappa2, (float*)outEven, Vh*spinorSiteSize); free(tmp); }
/***************************************
 *         Conjugate Gradient          *
 *   This function will do the CG      *
 *  algorithm without preconditioning. *
 *    For optimization you must not    *
 *        change the algorithm.        *
 ***************************************
 r(0)    = b - Ax(0)
 p(0)    = r(0)
 rho(0)  = <r(0),r(0)>
 ***************************************
 for k=0,1,2,...,n-1
   q(k)      = A * p(k)
   dot_pq    = <p(k),q(k)>
   alpha     = rho(k) / dot_pq
   x(k+1)    = x(k) + alpha*p(k)
   r(k+1)    = r(k) - alpha*q(k)
   check convergence ||r(k+1)||_2 < eps
   rho(k+1)  = <r(k+1), r(k+1)>
   beta      = rho(k+1) / rho(k)
   p(k+1)    = r(k+1) + beta*p(k)
 ***************************************
 Parameters:
   n, nnz, maxNNZ          matrix dimensions, forwarded to matvec()
   data, indices, length   sparse matrix arrays; data and indices are mapped
                           with n*maxNNZ entries, length with n entries
                           (presumably an ELLPACK-style layout with entry j of
                           row i at data[j*n + i] -- TODO confirm in matvec)
   b                       right-hand side, n entries
   x                       initial guess on entry, solution on exit, n entries
   sc                      reads maxIter and tolerance; receives residual,
                           iter and timeMatvec

 The residual reported in sc->residual is sqrt(<r,r>) scaled by the inverse
 of the value nrm2() returned for the initial residual.  sc->iter is the
 zero-based index of the iteration in which convergence was detected, or
 sc->maxIter when the loop ran to completion.

 If the initial residual is exactly zero the start vector is kept as is,
 sc->residual is set to 0 and no iteration is performed (dividing by the zero
 norm would otherwise fill x with NaN).
 ***************************************/
void cg(const int n, const int nnz, const int maxNNZ, const floatType* data, const int* indices, const int* length, const floatType* b, floatType* x, struct SolverConfig* sc){
	floatType* r, *p, *q;
	floatType alpha, beta, rho, rho_old, dot_pq, bnrm2;
	int iter;
	double timeMatvec_s;
	double timeMatvec=0;
	int i;
	floatType temp;
	int start_is_exact;

	/* allocate memory */
	r = (floatType*)malloc(n * sizeof(floatType));
	p = (floatType*)malloc(n * sizeof(floatType));
	q = (floatType*)malloc(n * sizeof(floatType));
	/* NOTE(review): the allocations above are not checked for NULL. */

	/* Original author's note (translated): "actually also copy(x[0:n]), but
	 * error: not found on device???" */
	#pragma acc data copyin(data[0:n*maxNNZ], indices[0:n*maxNNZ], length[0:n], n, nnz, maxNNZ, b[0:n]) copy(x[0:n]) create(alpha, beta, r[0:n], p[0:n], q[0:n], i, temp)
	{
		DBGMAT("Start matrix A = ", n, nnz, maxNNZ, data, indices, length)
		DBGVEC("b = ", b, n);
		DBGVEC("x = ", x, n);

		/* r(0) = b - Ax(0) */
		timeMatvec_s = getWTime();
		/* Original author's note (translated): try inlining the matvec here */
		matvec(n, nnz, maxNNZ, data, indices, length, x, r);
		timeMatvec += getWTime() - timeMatvec_s;
		xpay(b, -1.0, n, r);
		DBGVEC("r = b - Ax = ", r, n);

		/* Calculate initial residuum; keep its inverse for normalisation.
		 * A zero norm means x already solves the system exactly. */
		nrm2(r, n, &bnrm2);
		start_is_exact = (bnrm2 == 0);
		if (!start_is_exact)
			bnrm2 = 1.0 /bnrm2;

		/* p(0) = r(0) */
		/* NOTE(review): this is a host-side copy inside the data region;
		 * confirm that r is up to date on the host at this point. */
		memcpy(p, r, n*sizeof(floatType));
		DBGVEC("p = r = ", p, n);

		/* rho(0) = <r(0),r(0)> */
		vectorDot(r, r, n, &rho);
		printf("rho_0=%e\n", rho);

		if (start_is_exact)
			sc->residual = 0;

		for(iter = 0; !start_is_exact && iter < sc->maxIter; iter++){
			DBGMSG("=============== Iteration %d ======================\n", iter);

			/* q(k) = A * p(k) */
			timeMatvec_s = getWTime();
			matvec(n, nnz, maxNNZ, data, indices, length, p, q);
			timeMatvec += getWTime() - timeMatvec_s;
			DBGVEC("q = A * p= ", q, n);

			/* dot_pq = <p(k),q(k)> */
			vectorDot(p, q, n, &dot_pq);
			DBGSCA("dot_pq = <p, q> = ", dot_pq);

			/* alpha = rho(k) / dot_pq */
			alpha = rho / dot_pq;
			DBGSCA("alpha = rho / dot_pq = ", alpha);

			/* x(k+1) = x(k) + alpha*p(k) */
			axpy(alpha, p, n, x);
			#pragma acc update host(x[0:n])
			DBGVEC("x = x + alpha * p= ", x, n);

			/* r(k+1) = r(k) - alpha*q(k) */
			axpy(-alpha, q, n, r);
			DBGVEC("r = r - alpha * q= ", r, n);

			rho_old = rho;
			DBGSCA("rho_old = rho = ", rho_old);

			/* rho(k+1) = <r(k+1), r(k+1)> */
			vectorDot(r, r, n, &rho);
			DBGSCA("rho = <r, r> = ", rho);

			/* Normalize the residual with initial one */
			sc->residual= sqrt(rho) * bnrm2;

			/* Check convergence ||r(k+1)||_2 < eps
			 * If the residual is smaller than the CG
			 * tolerance specified in the CG_TOLERANCE
			 * environment variable our solution vector
			 * is good enough and we can stop the
			 * algorithm. */
			printf("res_%d=%e\n", iter+1, sc->residual);
			if(sc->residual <= sc->tolerance)
				break;

			/* beta = rho(k+1) / rho(k) */
			beta = rho / rho_old;
			DBGSCA("beta = rho / rho_old= ", beta);

			/* p(k+1) = r(k+1) + beta*p(k) */
			xpay(r, beta, n, p);
			DBGVEC("p = r + beta * p> = ", p, n);
		}

		/* Store the number of iterations and the
		 * time for the sparse matrix vector
		 * product which is the most expensive
		 * function in the whole CG algorithm. */
		sc->iter = iter;
		sc->timeMatvec = timeMatvec;
	} /* end of data region */

	/* Clean up: release the host buffers only after the data region that
	 * maps them has been closed. */
	free(r);
	free(p);
	free(q);
}
void cgTest() { float mass = 0.01; float kappa = 1.0 / (2.0*(4 + mass)); float *gauge[4]; for (int dir = 0; dir < 4; dir++) { gauge[dir] = (float*)malloc(N*gaugeSiteSize*sizeof(float)); } //constructGaugeField(gauge); constructUnitGaugeField(gauge); float *spinorIn = (float*)malloc(N*spinorSiteSize*sizeof(float)); float *spinorOut = (float*)malloc(N*spinorSiteSize*sizeof(float)); #ifdef EVEN_ODD float *source = (float *)malloc(Nh*spinorSiteSize*sizeof(float)); float *tmp = (float *)malloc(Nh*spinorSiteSize*sizeof(float)); #else float *source = (float *)malloc(N*spinorSiteSize*sizeof(float)); #endif int i0 = 0; int s0 = 0; int c0 = 0; constructPointSpinorField(spinorIn, i0, s0, c0); //constructSpinorField(spinorIn); // Prepare the source term ax(2*kappa, spinorIn, N*spinorSiteSize); // see output element // Mat(source, gauge, spinorIn, kappa); // printSpinorElement(source, 0); #ifdef EVEN_ODD float *spinorInOdd = spinorIn + Nh*spinorSiteSize; dslashReference(tmp, gauge, spinorInOdd, 0, 0); xpay(spinorIn, kappa, tmp, Nh*spinorSiteSize); MatPCDag(source, gauge, tmp, kappa); #else MatDag(source, gauge, spinorIn, kappa); #endif cgCuda(spinorOut, gauge, source, kappa, 1e-7); // cg_reference(spinorOut, gauge, source, kappa, 1e-7); // Reconstruct the full inverse #ifdef EVEN_ODD float *spinorOutOdd = spinorOut + Nh*spinorSiteSize; dslashReference(spinorOutOdd, gauge, spinorOut, 1, 0); xpay(spinorInOdd, kappa, spinorOutOdd, Nh*spinorSiteSize); #endif printf("Result norm = %e\n", norm(spinorOut, N*spinorSiteSize)); // release memory for (int dir = 0; dir < 4; dir++) free(gauge[dir]); free(source); #ifdef EVEN_ODD free(tmp); #endif free(spinorIn); free(spinorOut); }
/* BiCGSTAB(L) algorithm for the n-by-n problem Ax = b */
/* Parameters:
     L      number of inner steps per outer iteration
     n      length of x and b
     x      initial guess on entry, updated in place
     A      callback invoked as A(in, out, Adata); used here as out = A * in
     Adata  opaque pointer forwarded to A
     b      right-hand side
     tol    iteration stops once norm2(r[0]) <= tol * norm2(b)
            (norm2(b) is replaced by 1 when it is zero)
     iters  on entry the iteration limit, on exit the number of outer
            iterations performed
     work   caller-provided scratch of (2L+3)*n realnum values; if NULL the
            function only returns that required size and touches nothing else
     quiet  suppresses the progress and final-residual messages

   Returns 0 on convergence, 1 if the iteration limit *iters was reached,
   -1 if |rho| dropped below the breakdown threshold, or the required
   workspace size when work is NULL.

   xpay(n, x, a, y) is presumed to perform x += a*y in place (TODO confirm
   against its definition). */
ptrdiff_t bicgstabL(const int L, const size_t n, realnum *x, bicgstab_op A, void *Adata,
                    const realnum *b, const double tol, int *iters, realnum *work,
                    const bool quiet) {
  if (!work) return (2 * L + 3) * n; // required workspace

  // Carve the workspace: r[0..L] occupy the first L+1 blocks of n values,
  // u[0..L] the next L+1 blocks; rtilde (below) uses the final block.
  prealnum *r = new prealnum[L + 1];
  prealnum *u = new prealnum[L + 1];
  for (int i = 0; i <= L; ++i) {
    r[i] = work + i * n;
    u[i] = work + (L + 1 + i) * n;
  }

  // Reference norm for the relative stopping test; avoid dividing by zero
  // in the progress output when b == 0.
  double bnrm = norm2(n, b);
  if (bnrm == 0.0) bnrm = 1.0;

  int iter = 0;
  double last_output_wall_time = wall_time();

  // Small dense helpers; indices 1..L are the ones used below (tau is stored
  // row-major as an L-by-L array with zero-based (j-1, i-1) indexing).
  double *gamma = new double[L + 1];
  double *gamma_p = new double[L + 1];
  double *gamma_pp = new double[L + 1];
  double *tau = new double[L * L];
  double *sigma = new double[L + 1];

  int ierr = 0; // error code to return, if any
  const double breaktol = 1e-30;

  /**** FIXME: check for breakdown conditions(?) during iteration  ****/

  // rtilde = r[0] = b - Ax
  realnum *rtilde = work + (2 * L + 2) * n;
  A(x, r[0], Adata);
  for (size_t m = 0; m < n; ++m)
    rtilde[m] = r[0][m] = b[m] - r[0][m];

  { /* Sleipjen normalizes rtilde in his code; it seems to help slightly */
    // NOTE(review): divides by zero when the initial residual is exactly
    // zero; for tol >= 0 the main loop is then skipped, so rtilde is not
    // read again.
    double s = 1.0 / norm2(n, rtilde);
    for (size_t m = 0; m < n; ++m)
      rtilde[m] *= s;
  }

  memset(u[0], 0, sizeof(realnum) * n); // u[0] = 0

  double rho = 1.0, alpha = 0, omega = 1;
  double resid;
  while ((resid = norm2(n, r[0])) > tol * bnrm) {
    ++iter;
    // Rate-limited progress report
    if (!quiet && wall_time() > last_output_wall_time + MEEP_MIN_OUTPUT_TIME) {
      master_printf("residual[%d] = %g\n", iter, resid / bnrm);
      last_output_wall_time = wall_time();
    }

    rho = -omega * rho;
    // First part: L steps that extend r[0..j+1] and u[0..j+1] by one
    // application of A each and update x along u[0].
    for (int j = 0; j < L; ++j) {
      // Breakdown: rho is about to be used as a divisor
      if (fabs(rho) < breaktol) {
        ierr = -1;
        goto finish;
      }
      double rho1 = dot(n, r[j], rtilde);
      double beta = alpha * rho1 / rho;
      rho = rho1;
      for (int i = 0; i <= j; ++i)
        for (size_t m = 0; m < n; ++m)
          u[i][m] = r[i][m] - beta * u[i][m];
      A(u[j], u[j + 1], Adata);
      alpha = rho / dot(n, u[j + 1], rtilde);
      for (int i = 0; i <= j; ++i)
        xpay(n, r[i], -alpha, u[i + 1]);
      A(r[j], r[j + 1], Adata);
      xpay(n, x, alpha, u[0]);
    }

    // Second part: orthogonalise r[1..L] against each other (coefficients in
    // tau, squared norms in sigma) and project r[0] onto them (gamma_p).
    for (int j = 1; j <= L; ++j) {
      for (int i = 1; i < j; ++i) {
        int ij = (j - 1) * L + (i - 1);
        tau[ij] = dot(n, r[j], r[i]) / sigma[i];
        xpay(n, r[j], -tau[ij], r[i]);
      }
      sigma[j] = dot(n, r[j], r[j]);
      gamma_p[j] = dot(n, r[0], r[j]) / sigma[j];
    }

    // Back-substitution through tau to obtain gamma and gamma_pp
    omega = gamma[L] = gamma_p[L];
    for (int j = L - 1; j >= 1; --j) {
      gamma[j] = gamma_p[j];
      for (int i = j + 1; i <= L; ++i)
        gamma[j] -= tau[(i - 1) * L + (j - 1)] * gamma[i];
    }
    for (int j = 1; j < L; ++j) {
      gamma_pp[j] = gamma[j + 1];
      for (int i = j + 1; i < L; ++i)
        gamma_pp[j] += tau[(i - 1) * L + (j - 1)] * gamma[i + 1];
    }

    // Apply the combined update to x, r[0] and u[0]
    xpay(n, x, gamma[1], r[0]);
    xpay(n, r[0], -gamma_p[L], r[L]);
    xpay(n, u[0], -gamma[L], u[L]);
    for (int j = 1; j < L; ++j) { /* TODO: use blas DGEMV (for L > 2) */
      xpay(n, x, gamma_pp[j], r[j]);
      xpay(n, r[0], -gamma_p[j], r[j]);
      xpay(n, u[0], -gamma[j], u[j]);
    }

    // Iteration limit reached
    if (iter == *iters) {
      ierr = 1;
      break;
    }
  }

  if (!quiet) master_printf("final residual = %g\n", norm2(n, r[0]) / bnrm);

// Common exit: release the helper arrays (the workspace itself belongs to the
// caller) and report the iteration count.
finish:
  delete[] sigma;
  delete[] tau;
  delete[] gamma_pp;
  delete[] gamma_p;
  delete[] gamma;
  delete[] u;
  delete[] r;

  *iters = iter;
  return ierr;
}