void DiracStaggered::checkParitySpinor(const cudaColorSpinorField &in, const cudaColorSpinorField &out) const { if (in.Precision() != out.Precision()) { errorQuda("Input and output spinor precisions don't match in dslash_quda"); } if (in.Stride() != out.Stride()) { errorQuda("Input %d and output %d spinor strides don't match in dslash_quda", in.Stride(), out.Stride()); } if (in.SiteSubset() != QUDA_PARITY_SITE_SUBSET || out.SiteSubset() != QUDA_PARITY_SITE_SUBSET) { errorQuda("ColorSpinorFields are not single parity, in = %d, out = %d", in.SiteSubset(), out.SiteSubset()); } }
void DiracStaggered::checkParitySpinor(const cudaColorSpinorField &in, const cudaColorSpinorField &out) const { if (in.Precision() != out.Precision()) { errorQuda("Input and output spinor precisions don't match in dslash_quda"); } if (in.Stride() != out.Stride()) { errorQuda("Input %d and output %d spinor strides don't match in dslash_quda", in.Stride(), out.Stride()); } if (in.SiteSubset() != QUDA_PARITY_SITE_SUBSET || out.SiteSubset() != QUDA_PARITY_SITE_SUBSET) { errorQuda("ColorSpinorFields are not single parity, in = %d, out = %d", in.SiteSubset(), out.SiteSubset()); } if ((out.Volume() != 2*fatGauge->VolumeCB() && out.SiteSubset() == QUDA_FULL_SITE_SUBSET) || (out.Volume() != fatGauge->VolumeCB() && out.SiteSubset() == QUDA_PARITY_SITE_SUBSET) ) { errorQuda("Spinor volume %d doesn't match gauge volume %d", out.Volume(), fatGauge->VolumeCB()); } }
void Dirac::checkParitySpinor(const cudaColorSpinorField &out, const cudaColorSpinorField &in) const { if (in.GammaBasis() != QUDA_UKQCD_GAMMA_BASIS || out.GammaBasis() != QUDA_UKQCD_GAMMA_BASIS) { errorQuda("CUDA Dirac operator requires UKQCD basis, out = %d, in = %d", out.GammaBasis(), in.GammaBasis()); } if (in.Precision() != out.Precision()) { errorQuda("Input precision %d and output spinor precision %d don't match in dslash_quda", in.Precision(), out.Precision()); } if (in.Stride() != out.Stride()) { errorQuda("Input %d and output %d spinor strides don't match in dslash_quda", in.Stride(), out.Stride()); } if (in.SiteSubset() != QUDA_PARITY_SITE_SUBSET || out.SiteSubset() != QUDA_PARITY_SITE_SUBSET) { errorQuda("ColorSpinorFields are not single parity: in = %d, out = %d", in.SiteSubset(), out.SiteSubset()); } if (out.Ndim() != 5) { if ((out.Volume() != gauge.Volume() && out.SiteSubset() == QUDA_FULL_SITE_SUBSET) || (out.Volume() != gauge.VolumeCB() && out.SiteSubset() == QUDA_PARITY_SITE_SUBSET) ) { errorQuda("Spinor volume %d doesn't match gauge volume %d", out.Volume(), gauge.VolumeCB()); } } else { // Domain wall fermions, compare 4d volumes not 5d if ((out.Volume()/out.X(4) != gauge.Volume() && out.SiteSubset() == QUDA_FULL_SITE_SUBSET) || (out.Volume()/out.X(4) != gauge.VolumeCB() && out.SiteSubset() == QUDA_PARITY_SITE_SUBSET) ) { errorQuda("Spinor volume %d doesn't match gauge volume %d", out.Volume(), gauge.VolumeCB()); } } }
/**
 * CG solve using a "sloppy" working precision with periodic reliable
 * updates (the count is reported on the "CG: Reliable updates" line).
 *
 * The iteration runs while r2 exceeds norm2(b) * tol^2 and the iteration
 * count is below invParam.maxiter.  On exit invParam.secs, invParam.gflops
 * and invParam.iter are filled in.
 *
 * @param x  Initial guess on entry; receives the result on exit
 * @param b  Source field
 */
void CG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b)
{
  int iter = 0;

  // Residual field, plus a zero-created field y used as the accumulator
  cudaColorSpinorField r(b);
  ColorSpinorParam param(x);
  param.create = QUDA_ZERO_FIELD_CREATE;
  cudaColorSpinorField y(b, param);

  mat(r, x, y);
  zeroCuda(y);
  double r2 = xmyNormCuda(b, r);
  int reliableUpdates = 1; // the initial residual evaluation above is counted

  // Work fields in the sloppy precision
  param.precision = invParam.cuda_prec_sloppy;
  cudaColorSpinorField Ap(x, param);
  cudaColorSpinorField tmp(x, param);
  cudaColorSpinorField tmp2(x, param); // only needed for clover and twisted mass

  // Either alias x and r directly, or make sloppy-precision copies of them
  cudaColorSpinorField *x_sloppy;
  cudaColorSpinorField *r_sloppy;
  if (invParam.cuda_prec_sloppy != x.Precision()) {
    param.create = QUDA_COPY_FIELD_CREATE;
    x_sloppy = new cudaColorSpinorField(x, param);
    r_sloppy = new cudaColorSpinorField(r, param);
  } else {
    param.create = QUDA_REFERENCE_FIELD_CREATE;
    x_sloppy = &x;
    r_sloppy = &r;
  }
  cudaColorSpinorField &xSloppy = *x_sloppy;
  cudaColorSpinorField &rSloppy = *r_sloppy;

  cudaColorSpinorField p(rSloppy);

  const double src_norm = norm2(b);
  const double stop = src_norm*invParam.tol*invParam.tol; // stopping condition of solver

  // Bookkeeping for the reliable update conditions
  double rNorm = sqrt(r2);
  double r0Norm = rNorm;
  double maxrx = rNorm;
  double maxrr = rNorm;
  const double delta = invParam.reliable_delta;

  if (invParam.verbosity >= QUDA_VERBOSE) printfQuda("CG: %d iterations, r2 = %e\n", iter, r2);

  quda::blas_flops = 0;
  stopwatchStart();

  while (r2 > stop && iter < invParam.maxiter) {
    matSloppy(Ap, p, tmp, tmp2); // tmp as tmp

    const double pAp = reDotProductCuda(p, Ap);
    const double alpha = r2 / pAp;
    const double r2_prev = r2;
    r2 = axpyNormCuda(-alpha, Ap, rSloppy);

    // reliable update conditions
    rNorm = sqrt(r2);
    if (rNorm > maxrx) maxrx = rNorm;
    if (rNorm > maxrr) maxrr = rNorm;
    const bool updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx);
    const bool updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX);

    // updateR already includes updateX, so it alone selects the branch
    if (updateR) {
      axpyCuda(alpha, p, xSloppy);

      if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);

      xpyCuda(x, y); // swap these around?
      mat(r, y, x);  // here we can use x as tmp
      r2 = xmyNormCuda(b, r);

      if (x.Precision() != rSloppy.Precision()) copyCuda(rSloppy, r);
      zeroCuda(xSloppy);

      rNorm = sqrt(r2);
      maxrr = rNorm;
      maxrx = rNorm;
      r0Norm = rNorm;
      reliableUpdates++;

      const double beta = r2 / r2_prev;
      xpayCuda(rSloppy, beta, p);
    } else {
      const double beta = r2 / r2_prev;
      axpyZpbxCuda(alpha, p, xSloppy, rSloppy, beta);
    }

    iter++;
    if (invParam.verbosity >= QUDA_VERBOSE) printfQuda("CG: %d iterations, r2 = %e\n", iter, r2);
  }

  if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
  xpyCuda(y, x);

  invParam.secs = stopwatchReadSeconds();

  if (iter == invParam.maxiter) warningQuda("Exceeded maximum iterations %d", invParam.maxiter);

  if (invParam.verbosity >= QUDA_SUMMARIZE) printfQuda("CG: Reliable updates = %d\n", reliableUpdates);

  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
  reduceDouble(gflops);

  // printfQuda("%f gflops\n", gflops / stopwatchReadSeconds());
  invParam.gflops = gflops;
  invParam.iter = iter;

  quda::blas_flops = 0;

  if (invParam.verbosity >= QUDA_SUMMARIZE) {
    // Recompute the residual with mat to report it next to the iterated one
    mat(r, x, y);
    const double true_res = xmyNormCuda(b, r);
    printfQuda("CG: Converged after %d iterations, relative residua: iterated = %e, true = %e\n",
               iter, sqrt(r2/src_norm), sqrt(true_res / src_norm));
  }

  // The sloppy fields are heap-allocated only in the mixed-precision case
  if (invParam.cuda_prec_sloppy != x.Precision()) {
    delete r_sloppy;
    delete x_sloppy;
  }

  return;
}
/**
 * CG solve using a "sloppy" working precision with reliable updates,
 * an optional pipelined variant (param.pipeline) and optional
 * heavy-quark residual monitoring (param.residual_type).
 *
 * A source with norm2(b) == 0 is handled up front: x is set to b, both
 * true residuals are set to zero and the routine returns immediately.
 *
 * On exit param.secs, param.gflops, param.true_res and param.true_res_hq
 * are set, and param.iter is incremented by the number of iterations
 * performed.
 *
 * @param x  Initial guess on entry; receives the result on exit
 * @param b  Source field
 */
void CG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b)
{
  profile.Start(QUDA_PROFILE_INIT);

  // Check to see that we're not trying to invert on a zero-field source
  const double b2 = norm2(b);
  if (b2 == 0) {
    profile.Stop(QUDA_PROFILE_INIT);
    printfQuda("Warning: inverting on zero-field source\n");
    x = b;
    param.true_res = 0.0;
    param.true_res_hq = 0.0;
    return;
  }

  cudaColorSpinorField r(b);

  ColorSpinorParam csParam(x);
  csParam.create = QUDA_ZERO_FIELD_CREATE;
  cudaColorSpinorField y(b, csParam);

  mat(r, x, y);
  // zeroCuda(y);

  double r2 = xmyNormCuda(b, r);

  csParam.setPrecision(param.precision_sloppy);
  cudaColorSpinorField Ap(x, csParam);
  cudaColorSpinorField tmp(x, csParam);

  // A second temporary is allocated only for non-staggered operators;
  // otherwise tmp2 simply aliases tmp.
  cudaColorSpinorField *tmp2_p = &tmp;
  // tmp only needed for multi-gpu Wilson-like kernels
  if (mat.Type() != typeid(DiracStaggeredPC).name() &&
      mat.Type() != typeid(DiracStaggered).name()) {
    tmp2_p = new cudaColorSpinorField(x, csParam);
  }
  cudaColorSpinorField &tmp2 = *tmp2_p;

  // Either alias x and r directly, or make sloppy-precision copies of them
  cudaColorSpinorField *x_sloppy, *r_sloppy;
  if (param.precision_sloppy == x.Precision()) {
    csParam.create = QUDA_REFERENCE_FIELD_CREATE;
    x_sloppy = &x;
    r_sloppy = &r;
  } else {
    csParam.create = QUDA_COPY_FIELD_CREATE;
    x_sloppy = new cudaColorSpinorField(x, csParam);
    r_sloppy = new cudaColorSpinorField(r, csParam);
  }

  cudaColorSpinorField &xSloppy = *x_sloppy;
  cudaColorSpinorField &rSloppy = *r_sloppy;

  cudaColorSpinorField p(rSloppy);

  if (&x != &xSloppy) {
    copyCuda(y, x);
    zeroCuda(xSloppy);
  } else {
    zeroCuda(y);
  }

  const bool use_heavy_quark_res = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;

  profile.Stop(QUDA_PROFILE_INIT);
  profile.Start(QUDA_PROFILE_PREAMBLE);

  double r2_old;

  double stop = b2*param.tol*param.tol; // stopping condition of solver

  double heavy_quark_res = 0.0; // heavy quark residual
  if (use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(x,r).z);
  int heavy_quark_check = 10; // how often to check the heavy quark residual

  double alpha = 0.0, beta = 0.0;
  double pAp;
  int rUpdate = 0;

  double rNorm = sqrt(r2);
  double r0Norm = rNorm;
  double maxrx = rNorm;
  double maxrr = rNorm;
  double delta = param.delta;

  // this parameter determines how many consecutive reliable update
  // residual increases we tolerate before terminating the solver,
  // i.e., how long do we want to keep trying to converge
  int maxResIncrease = 0; // 0 means we have no tolerance

  // Number of consecutive reliable updates whose residual went up.
  // Deliberately an ordinary local (not a function-level static) so that
  // every solve starts counting from zero and no state is shared between
  // separate invocations.
  int resIncrease = 0;

  profile.Stop(QUDA_PROFILE_PREAMBLE);
  profile.Start(QUDA_PROFILE_COMPUTE);
  blas_flops = 0;

  int k = 0;

  PrintStats("CG", k, r2, b2, heavy_quark_res);

  int steps_since_reliable = 1;

  while (!convergence(r2, heavy_quark_res, stop, param.tol_hq) && k < param.maxiter) {
    matSloppy(Ap, p, tmp, tmp2); // tmp as tmp

    double sigma;

    bool breakdown = false;

    if (param.pipeline) {
      double3 triplet = tripleCGReductionCuda(rSloppy, Ap, p);
      r2 = triplet.x;
      double Ap2 = triplet.y;
      pAp = triplet.z;
      r2_old = r2;

      alpha = r2 / pAp;
      sigma = alpha*(alpha * Ap2 - pAp);
      if (sigma < 0.0 || steps_since_reliable == 0) { // sigma condition has broken down
        r2 = axpyNormCuda(-alpha, Ap, rSloppy);
        sigma = r2;
        breakdown = true;
      }

      r2 = sigma;
    } else {
      r2_old = r2;
      pAp = reDotProductCuda(p, Ap);
      alpha = r2 / pAp;

      // here we are deploying the alternative beta computation
      Complex cg_norm = axpyCGNormCuda(-alpha, Ap, rSloppy);
      r2 = real(cg_norm); // (r_new, r_new)
      sigma = imag(cg_norm) >= 0.0 ? imag(cg_norm) : r2; // use r2 if (r_k+1, r_k+1-r_k) breaks
    }

    // reliable update conditions
    rNorm = sqrt(r2);
    if (rNorm > maxrx) maxrx = rNorm;
    if (rNorm > maxrr) maxrr = rNorm;
    int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
    int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;

    // force a reliable update if we are within target tolerance (only if doing reliable updates)
    if (convergence(r2, heavy_quark_res, stop, param.tol_hq) && delta >= param.tol) updateX = 1;

    if (!(updateR || updateX)) {
      //beta = r2 / r2_old;
      beta = sigma / r2_old; // use the alternative beta computation

      if (param.pipeline && !breakdown) tripleCGUpdateCuda(alpha, beta, Ap, rSloppy, xSloppy, p);
      else axpyZpbxCuda(alpha, p, xSloppy, rSloppy, beta);

      if (use_heavy_quark_res && k%heavy_quark_check == 0) {
        copyCuda(tmp, y);
        heavy_quark_res = sqrt(xpyHeavyQuarkResidualNormCuda(xSloppy, tmp, rSloppy).z);
      }

      steps_since_reliable++;
    } else {
      axpyCuda(alpha, p, xSloppy);
      if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);

      xpyCuda(x, y); // swap these around?
      mat(r, y, x);  // here we can use x as tmp
      r2 = xmyNormCuda(b, r);

      if (x.Precision() != rSloppy.Precision()) copyCuda(rSloppy, r);
      zeroCuda(xSloppy);

      // break-out check if we have reached the limit of the precision
      if (sqrt(r2) > r0Norm && updateX) { // reuse r0Norm for this
        warningQuda("CG: new reliable residual norm %e is greater than previous reliable residual norm %e", sqrt(r2), r0Norm);
        if (++resIncrease > maxResIncrease) {
          // The break skips the bookkeeping at the bottom of the loop, so
          // account for this iteration and this reliable update here.  Doing
          // it only on the break path avoids counting them twice when the
          // solver is allowed to continue.
          k++;
          rUpdate++;
          break;
        }
      } else {
        resIncrease = 0;
      }

      rNorm = sqrt(r2);
      maxrr = rNorm;
      maxrx = rNorm;
      r0Norm = rNorm;
      rUpdate++;

      // explicitly restore the orthogonality of the gradient vector
      double rp = reDotProductCuda(rSloppy, p) / (r2);
      axpyCuda(-rp, rSloppy, p);

      beta = r2 / r2_old;
      xpayCuda(rSloppy, beta, p);

      if (use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(y,r).z);

      steps_since_reliable = 0;
    }

    k++;

    PrintStats("CG", k, r2, b2, heavy_quark_res);
  }

  if (x.Precision() != xSloppy.Precision()) copyCuda(x, xSloppy);
  xpyCuda(y, x);

  profile.Stop(QUDA_PROFILE_COMPUTE);
  profile.Start(QUDA_PROFILE_EPILOGUE);

  param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops())*1e-9;
  reduceDouble(gflops);
  param.gflops = gflops;
  param.iter += k;

  if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);

  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("CG: Reliable updates = %d\n", rUpdate);

  // compute the true residuals
  mat(r, x, y);
  param.true_res = sqrt(xmyNormCuda(b, r) / b2);
#if (__COMPUTE_CAPABILITY__ >= 200)
  param.true_res_hq = sqrt(HeavyQuarkResidualNormCuda(x,r).z);
#else
  param.true_res_hq = 0.0;
#endif

  PrintSummary("CG", k, r2, b2);

  // reset the flops counters
  quda::blas_flops = 0;
  mat.flops();
  matSloppy.flops();

  profile.Stop(QUDA_PROFILE_EPILOGUE);
  profile.Start(QUDA_PROFILE_FREE);

  // Release only what was heap-allocated above
  if (&tmp2 != &tmp) delete tmp2_p;

  if (param.precision_sloppy != x.Precision()) {
    delete r_sloppy;
    delete x_sloppy;
  }

  profile.Stop(QUDA_PROFILE_FREE);

  return;
}
/**
 * Preconditioned CG solve with a sloppy working precision and reliable
 * updates.  When the preconditioner K is null the iteration falls back
 * to the unpreconditioned update ("standard CG" branch below).
 *
 * A source with norm2(b) == 0 is handled up front: x is set to b, both
 * true residuals are set to zero and the routine returns immediately,
 * so the final residual is never divided by a zero source norm.
 *
 * On exit param.secs, param.gflops and param.true_res are set, and
 * param.iter is incremented by the number of iterations performed.
 *
 * @param x  Initial guess on entry; receives the result on exit
 * @param b  Source field
 */
void PreconCG::operator()(cudaColorSpinorField &x, cudaColorSpinorField &b)
{
  profile.Start(QUDA_PROFILE_INIT);

  // Check to see that we're not trying to invert on a zero-field source
  const double b2 = norm2(b);
  if (b2 == 0) {
    profile.Stop(QUDA_PROFILE_INIT);
    printfQuda("Warning: inverting on zero-field source\n");
    x = b;
    param.true_res = 0.0;
    param.true_res_hq = 0.0;
    return; // nothing to solve
  }

  int k = 0;
  int rUpdate = 0;

  // Preconditioner work fields; they stay null when K is not set so that
  // no indeterminate pointer value is ever copied or compared.
  cudaColorSpinorField *minvrPre = 0;
  cudaColorSpinorField *rPre = 0;
  cudaColorSpinorField *minvr = 0;
  cudaColorSpinorField *minvrSloppy = 0;
  cudaColorSpinorField *p = 0;

  ColorSpinorParam csParam(b);
  cudaColorSpinorField r(b);
  if (K) minvr = new cudaColorSpinorField(b);
  csParam.create = QUDA_ZERO_FIELD_CREATE;
  cudaColorSpinorField y(b, csParam);

  mat(r, x, y); // => r = A*x;
  double r2 = xmyNormCuda(b, r);

  csParam.setPrecision(param.precision_sloppy);
  cudaColorSpinorField tmpSloppy(x, csParam);
  cudaColorSpinorField Ap(x, csParam);

  // Sloppy residual (and preconditioned residual): alias or copy
  cudaColorSpinorField *r_sloppy;
  if (param.precision_sloppy == x.Precision()) {
    r_sloppy = &r;
    minvrSloppy = minvr;
  } else {
    csParam.create = QUDA_COPY_FIELD_CREATE;
    r_sloppy = new cudaColorSpinorField(r, csParam);
    if (K) minvrSloppy = new cudaColorSpinorField(*minvr, csParam);
  }

  // Sloppy solution accumulator: a separate copy is made only when the
  // precisions differ AND the sloppy partial accumulator is requested.
  cudaColorSpinorField *x_sloppy;
  if (param.precision_sloppy == x.Precision() || !param.use_sloppy_partial_accumulator) {
    csParam.create = QUDA_REFERENCE_FIELD_CREATE;
    x_sloppy = &x;
  } else {
    csParam.create = QUDA_COPY_FIELD_CREATE;
    x_sloppy = new cudaColorSpinorField(x, csParam);
  }

  cudaColorSpinorField &xSloppy = *x_sloppy;
  cudaColorSpinorField &rSloppy = *r_sloppy;

  if (&x != &xSloppy) {
    copyCuda(y, x); // copy x to y
    zeroCuda(xSloppy);
  } else {
    zeroCuda(y); // no reliable updates // NB: check this
  }

  const bool use_heavy_quark_res = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false;

  if (K) {
    csParam.create = QUDA_COPY_FIELD_CREATE;
    csParam.setPrecision(param.precision_precondition);
    rPre = new cudaColorSpinorField(rSloppy, csParam);
    // Create minvrPre
    minvrPre = new cudaColorSpinorField(*rPre);
    globalReduce = false;
    (*K)(*minvrPre, *rPre);
    globalReduce = true;
    *minvrSloppy = *minvrPre;
    p = new cudaColorSpinorField(*minvrSloppy);
  } else {
    p = new cudaColorSpinorField(rSloppy);
  }

  profile.Stop(QUDA_PROFILE_INIT);
  profile.Start(QUDA_PROFILE_PREAMBLE);

  double stop = stopping(param.tol, b2, param.residual_type); // stopping condition of solver
  double heavy_quark_res = 0.0; // heavy quark residual
  if (use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNormCuda(x,r).z);

  double alpha = 0.0, beta = 0.0;
  double pAp;
  double rMinvr = 0;
  double rMinvr_old = 0.0;
  double r_new_Minvr_old = 0.0;
  double r2_old = 0;
  r2 = norm2(r);

  double rNorm = sqrt(r2);
  double r0Norm = rNorm;
  double maxrx = rNorm;
  double maxrr = rNorm;
  double delta = param.delta;

  if (K) rMinvr = reDotProductCuda(rSloppy, *minvrSloppy);

  profile.Stop(QUDA_PROFILE_PREAMBLE);
  profile.Start(QUDA_PROFILE_COMPUTE);

  quda::blas_flops = 0;

  const int maxResIncrease = 0;

  // Number of consecutive reliable updates whose residual went up.
  // Deliberately an ordinary local (not a function-level static) so that
  // every solve starts counting from zero and no state is shared between
  // separate invocations.
  int resIncrease = 0;

  while (!convergence(r2, heavy_quark_res, stop, param.tol_hq) && k < param.maxiter) {
    matSloppy(Ap, *p, tmpSloppy);

    double sigma;
    pAp = reDotProductCuda(*p, Ap);

    alpha = (K) ? rMinvr/pAp : r2/pAp;
    Complex cg_norm = axpyCGNormCuda(-alpha, Ap, rSloppy); // r --> r - alpha*A*p
    r2_old = r2;
    r2 = real(cg_norm);

    sigma = imag(cg_norm) >= 0.0 ? imag(cg_norm) : r2; // use r2 if (r_k+1, r_k-1 - r_k) breaks

    if (K) rMinvr_old = rMinvr;

    // reliable update conditions
    rNorm = sqrt(r2);
    if (rNorm > maxrx) maxrx = rNorm;
    if (rNorm > maxrr) maxrr = rNorm;

    int updateX = (rNorm < delta*r0Norm && r0Norm <= maxrx) ? 1 : 0;
    int updateR = ((rNorm < delta*maxrr && r0Norm <= maxrr) || updateX) ? 1 : 0;

    // force a reliable update if we are within target tolerance (only if doing reliable updates)
    if (convergence(r2, heavy_quark_res, stop, param.tol_hq) && delta >= param.tol) updateX = 1;

    if (!(updateR || updateX)) {
      if (K) {
        r_new_Minvr_old = reDotProductCuda(rSloppy, *minvrSloppy);
        *rPre = rSloppy;
        globalReduce = false;
        (*K)(*minvrPre, *rPre);
        globalReduce = true;

        *minvrSloppy = *minvrPre;

        rMinvr = reDotProductCuda(rSloppy, *minvrSloppy);
        beta = (rMinvr - r_new_Minvr_old)/rMinvr_old;
        axpyZpbxCuda(alpha, *p, xSloppy, *minvrSloppy, beta);
      } else {
        beta = sigma/r2_old; // use the alternative beta computation
        axpyZpbxCuda(alpha, *p, xSloppy, rSloppy, beta);
      }
    } else { // reliable update
      axpyCuda(alpha, *p, xSloppy); // xSloppy += alpha*p
      copyCuda(x, xSloppy);
      xpyCuda(x, y); // y += x
      // Now compute r
      mat(r, y, x); // x is just a temporary here
      r2 = xmyNormCuda(b, r);
      copyCuda(rSloppy, r); // copy r to rSloppy
      zeroCuda(xSloppy);

      // break-out check if we have reached the limit of the precision
      if (sqrt(r2) > r0Norm && updateX) { // reuse r0Norm for this
        warningQuda("PCG: new reliable residual norm %e is greater than previous reliable residual norm %e", sqrt(r2), r0Norm);
        if (++resIncrease > maxResIncrease) {
          // The break skips the bookkeeping further down, so account for
          // this iteration and this reliable update here.  Doing it only
          // on the break path avoids counting them twice when the solver
          // is allowed to continue.
          k++;
          rUpdate++;
          break;
        }
      } else {
        resIncrease = 0;
      }

      rNorm = sqrt(r2);
      maxrr = rNorm;
      maxrx = rNorm;
      r0Norm = rNorm;
      ++rUpdate;

      if (K) {
        *rPre = rSloppy;
        globalReduce = false;
        (*K)(*minvrPre, *rPre);
        globalReduce = true;

        *minvrSloppy = *minvrPre;

        rMinvr = reDotProductCuda(rSloppy, *minvrSloppy);
        beta = rMinvr/rMinvr_old;

        xpayCuda(*minvrSloppy, beta, *p); // p = minvrSloppy + beta*p
      } else { // standard CG - no preconditioning
        // explicitly restore the orthogonality of the gradient vector
        double rp = reDotProductCuda(rSloppy, *p)/(r2);
        axpyCuda(-rp, rSloppy, *p);

        beta = r2/r2_old;
        xpayCuda(rSloppy, beta, *p);
      }
    }

    ++k;
    PrintStats("PCG", k, r2, b2, heavy_quark_res);
  }

  profile.Stop(QUDA_PROFILE_COMPUTE);
  profile.Start(QUDA_PROFILE_EPILOGUE);

  if (x.Precision() != param.precision_sloppy) copyCuda(x, xSloppy);
  xpyCuda(y, x); // x += y

  param.secs = profile.Last(QUDA_PROFILE_COMPUTE);
  double gflops = (quda::blas_flops + mat.flops() + matSloppy.flops() + matPrecon.flops())*1e-9;
  reduceDouble(gflops);
  param.gflops = gflops;
  param.iter += k;

  if (k == param.maxiter) warningQuda("Exceeded maximum iterations %d", param.maxiter);

  if (getVerbosity() >= QUDA_VERBOSE) printfQuda("CG: Reliable updates = %d\n", rUpdate);

  // compute the true residual
  mat(r, x, y);
  double true_res = xmyNormCuda(b, r);
  param.true_res = sqrt(true_res / b2);

  // reset the flops counters
  quda::blas_flops = 0;
  mat.flops();
  matSloppy.flops();
  matPrecon.flops();

  profile.Stop(QUDA_PROFILE_EPILOGUE);
  profile.Start(QUDA_PROFILE_FREE);

  // Free strictly by ownership: a pointer is deleted only if it refers to
  // an object allocated with new above, never when it merely aliases a
  // caller-owned or stack field.  In particular x_sloppy aliases x whenever
  // use_sloppy_partial_accumulator is false, even if the precisions differ.
  if (K) { // These are only needed if preconditioning is used
    delete minvrPre;
    delete rPre;
    if (minvrSloppy != minvr) delete minvrSloppy;
    delete minvr;
  }
  delete p;

  if (x_sloppy != &x) delete x_sloppy;
  if (r_sloppy != &r) delete r_sloppy;

  profile.Stop(QUDA_PROFILE_FREE);
  return;
}