static int q_dirac_solver(lua_State *L) { CloverSolver *solver = lua_touserdata(L, lua_upvalueindex(1)); mClover *c = qlua_checkClover(L, lua_upvalueindex(2), NULL, 1); mLattice *S = qlua_ObjLattice(L, lua_upvalueindex(2)); int Sidx = lua_gettop(L); int relaxed_p; long long fl1; double t1; double out_eps; int out_iters; int log_level; switch (lua_type(L, 2)) { case LUA_TNONE: case LUA_TNIL: relaxed_p = 0; break; case LUA_TBOOLEAN: relaxed_p = lua_toboolean(L, 2); break; default: relaxed_p = 1; break; } if ((lua_type(L, 3) == LUA_TBOOLEAN) && (lua_toboolean(L, 3) != 0)) log_level = (QOP_CLOVER_LOG_CG_RESIDUAL | QOP_CLOVER_LOG_EIG_POSTAMBLE | QOP_CLOVER_LOG_EIG_UPDATE1); else log_level = 0; switch (qlua_qtype(L, 1)) { case qLatDirFerm3: { mLatDirFerm3 *psi = qlua_checkLatDirFerm3(L, 1, S, 3); mLatDirFerm3 *eta = qlua_newZeroLatDirFerm3(L, Sidx, 3); struct QOP_CLOVER_Fermion *c_psi; struct QOP_CLOVER_Fermion *c_eta; CL_D_env env; QLA_D_Real rhs_norm2 = 0; double rhs_n; int status; CALL_QDP(L); QDP_D3_r_eq_norm2_D(&rhs_norm2, psi->ptr, S->all); if (rhs_norm2 == 0) { lua_pushnumber(L, 0.0); lua_pushnumber(L, 0); lua_pushnumber(L, 0); lua_pushnumber(L, 0); return 5; } rhs_n = sqrt(rhs_norm2); env.lat = S->lat; env.f = QDP_D3_expose_D(psi->ptr); env.s = 1 / rhs_n; if (QOP_CLOVER_import_fermion(&c_psi, c->state, q_CL_D_reader_scaled, &env)) return luaL_error(L, "CLOVER_import_fermion() failed"); QDP_D3_reset_D(psi->ptr); if (QOP_CLOVER_allocate_fermion(&c_eta, c->state)) return luaL_error(L, "CLOVER_allocate_fermion() failed"); status = solver->proc(L, c_eta, &out_iters, &out_eps, c_psi, log_level); QOP_CLOVER_performance(&t1, &fl1, NULL, NULL, c->state); if (t1 == 0) t1 = -1; if (QDP_this_node == qlua_master_node) printf("CLOVER %s solver: status = %d," " eps = %.4e, iters = %d, time = %.3f sec," " perf = %.2f MFlops/sec\n", solver->name, status, out_eps, out_iters, t1, fl1 * 1e-6 / t1); env.lat = S->lat; env.f = QDP_D3_expose_D(eta->ptr); env.s = rhs_n; QOP_CLOVER_export_fermion(q_CL_D_writer_scaled, &env, c_eta); QDP_D3_reset_D(eta->ptr); QOP_CLOVER_free_fermion(&c_eta); QOP_CLOVER_free_fermion(&c_psi); if (status) { if (relaxed_p) return 0; else return luaL_error(L, QOP_CLOVER_error(c->state)); } /* eta is on the stack already */ lua_pushnumber(L, out_eps); lua_pushnumber(L, out_iters); lua_pushnumber(L, t1); lua_pushnumber(L, (double)fl1); return 5; } case qLatDirProp3: { mLatDirProp3 *psi = qlua_checkLatDirProp3(L, 1, S, 3); mLatDirProp3 *eta = qlua_newZeroLatDirProp3(L, Sidx, 3); struct QOP_CLOVER_Fermion *c_psi; struct QOP_CLOVER_Fermion *c_eta; int gstatus = 0; int status; qCL_P_env env; QLA_D_Real rhs_norm2 = 0; double rhs_n; lua_createtable(L, QOP_CLOVER_COLORS, 0); /* eps */ lua_createtable(L, QOP_CLOVER_COLORS, 0); /* iters */ CALL_QDP(L); if (QOP_CLOVER_allocate_fermion(&c_eta, c->state)) return luaL_error(L, "CLOVER_allocate_fermion() failed"); QDP_D3_r_eq_norm2_P(&rhs_norm2, psi->ptr, S->all); if (rhs_norm2 == 0) { return 3; } rhs_n = sqrt(rhs_norm2); env.lat = S->lat; env.in = QDP_D3_expose_P(psi->ptr); env.out = QDP_D3_expose_P(eta->ptr); for (env.c = 0; env.c < QOP_CLOVER_COLORS; env.c++) { lua_createtable(L, QOP_CLOVER_FERMION_DIM, 0); /* eps.c */ lua_createtable(L, QOP_CLOVER_FERMION_DIM, 0); /* iters.c */ for (env.d = 0; env.d < QOP_CLOVER_FERMION_DIM; env.d++) { env.s = 1 / rhs_n; if (QOP_CLOVER_import_fermion(&c_psi, c->state, q_CL_P_reader_scaled, &env)) return luaL_error(L, "CLOVER_import_fermion() failed"); status = solver->proc(L, c_eta, &out_iters, &out_eps, c_psi, log_level); QOP_CLOVER_performance(&t1, &fl1, NULL, NULL, c->state); if (t1 == 0) t1 = -1; if (QDP_this_node == qlua_master_node) printf("CLOVER %s solver: status = %d, c = %d, d = %d," " eps = %.4e, iters = %d, time = %.3f sec," " perf = %.2f MFlops/sec\n", solver->name, status, env.c, env.d, out_eps, out_iters, t1, fl1 * 1e-6 / t1); QOP_CLOVER_free_fermion(&c_psi); if (status) { if (relaxed_p) gstatus = 1; else return luaL_error(L, QOP_CLOVER_error(c->state)); } env.s = rhs_n; QOP_CLOVER_export_fermion(q_CL_P_writer_scaled, &env, c_eta); lua_pushnumber(L, out_eps); lua_rawseti(L, -3, env.d + 1); lua_pushnumber(L, out_iters); lua_rawseti(L, -2, env.d + 1); } lua_rawseti(L, -3, env.c + 1); lua_rawseti(L, -3, env.c + 1); } QDP_D3_reset_P(psi->ptr); QDP_D3_reset_P(eta->ptr); QOP_CLOVER_free_fermion(&c_eta); if (gstatus) return 0; else return 3; } default: break; } return luaL_error(L, "bad argument to CLOVER solver"); }
int bicgilu_cl_qop_single_for_double( int prop_type, QOP_FermionLinksWilson *qop_links, quark_invert_control *qic, int milc_parity, void *dmps[], float *kappas[], int nkappa[], QOP_DiracFermion **qop_sol[], QOP_DiracFermion *qop_src[], int nsrc, int *final_restart, Real *final_rsq_ptr ) { int i, iters, iters_F = 0; int converged; int nrestart; int max_restarts = qic->nrestart; int isrc, ikappa; int final_restart_F; Real final_rsq_F, final_relrsq_F; Real resid_F = 3e-7; /* The limits of a single precision inversion */ Real rel_F = 0; /* The limits of a single precision inversion */ QOP_invert_arg_t qop_invert_arg; QOP_resid_arg_t ***qop_resid_arg_F; QOP_info_t info_F = {0., 0., 0, 0, 0}, info = {0., 0., 0, 0, 0}; QDP_Subset subset = milc2qdp_subset(milc_parity); QOP_F3_FermionLinksWilson *qop_links_F; QOP_F3_DiracFermion **qop_sol_F[MAXSRC], *qop_rhs_F[MAXSRC]; QDP_F3_DiracFermion *qdp_rhs_F[MAXSRC]; QDP_D3_DiracFermion *qdp_src[MAXSRC], *qdp_resid[MAXSRC]; QDP_D3_DiracFermion *qdp_sol; Real relresid2[MAXSRC]; Real resid2[MAXSRC]; QLA_D_Real norm2_src[MAXSRC], norm2_resid[MAXSRC], norm_resid[MAXSRC], scale_resid; char myname[] = "bicgilu_cl_qop_single_for_double"; /* Only one kappa allowed per source for this algorithm */ for(i = 0; i < nsrc; i++){ if(nkappa[i] > 1){ printf("%s: nkappa[%d] = %d != 1\n",myname,i,nkappa[i]); terminate(1); } } /* Set qop_invert_arg */ /* We don't do restarts for the single precision step */ /* We interpret "qic->nrestart" to mean the max number of calls to the single-precision inverter */ set_qop_invert_arg_norestart( & qop_invert_arg, qic, milc_parity ); /* Pointers for residual errors */ /* For now we set the residual to something sensible for single precision */ qop_resid_arg_F = create_qop_resid_arg( nsrc, nkappa, resid_F*resid_F, rel_F*rel_F); /* Create a single precision copy of the links object */ qop_links_F = QOP_FD3_wilson_create_L_from_L( qop_links ); /* Take norm of source and create temporaries */ for(i = 0; i < nsrc; i++){ qdp_src[i] = QOP_D3_convert_D_to_qdp( qop_src[i] ); QDP_D3_r_eq_norm2_D( norm2_src+i, qdp_src[i], subset ); qdp_resid[i] = QDP_D3_create_D(); qdp_rhs_F[i] = QDP_F3_create_D(); qop_sol_F[i] = (QOP_F3_DiracFermion **)malloc(sizeof(QOP_F3_DiracFermion *)); } /* Main loop */ nrestart = 0; converged = 0; iters = 0; info.final_sec = -dclock(); info.final_flop = 0; info.status = QOP_SUCCESS; while(1){ /* Create new residual vectors from the result */ /* r = src - A sol */ compute_qdp_residuals( prop_type, qdp_resid, qdp_src, qop_links, qop_sol, dmps, kappas, nkappa, nsrc, milc_parity ); /* Compute two different norms */ qic->final_rsq = 0; qic->final_relrsq = 0; for(i = 0; i < nsrc; i++){ qdp_sol = QOP_convert_D_to_qdp( qop_sol[i][0] ); relresid2[i] = qdp_relative_residue( qdp_resid[i], qdp_sol, subset ); qop_sol[i][0] = QOP_convert_D_from_qdp( qdp_sol ); qic->final_relrsq = (relresid2[i] > qic->final_relrsq) ? relresid2[i] : qic->final_relrsq; QDP_D3_r_eq_norm2_D( norm2_resid+i, qdp_resid[i], subset ); resid2[i] = norm2_resid[i]/norm2_src[i]; qic->final_rsq = (resid2[i] > qic->final_rsq) ? resid2[i] : qic->final_rsq; #ifdef CG_DEBUG node0_printf("%s: double precision restart %d resid2 = %.2e vs %.2e relresid2 = %.2e vs %.2e\n", myname, nrestart, resid2[i], qic->resid * qic->resid, relresid2[i], qic->relresid * qic->relresid ); #endif } *final_rsq_ptr = qic->final_rsq; /* Use Cartesian norm for now */ *final_restart = nrestart; /* Stop when converged */ converged = 1; for(i = 0; i < nsrc; i++){ if((qic->resid > 0 && resid2[i] > qic->resid * qic->resid) || (qic->relresid > 0 && relresid2[i] > qic->relresid * qic->relresid)){ converged = 0; break; } } if(converged || nrestart++>=max_restarts)break; for(i = 0; i < nsrc; i++){ /* Scale the RHS to avoid underflow */ norm_resid[i] = sqrt(norm2_resid[i]); scale_resid = 1./norm_resid[i]; QDP_D3_D_eq_r_times_D(qdp_resid[i], &scale_resid, qdp_resid[i], subset); /* Scaled residual becomes the new source */ QDP_FD3_D_eq_D( qdp_rhs_F[i], qdp_resid[i], subset); qop_rhs_F[i] = QOP_F3_convert_D_from_qdp( qdp_rhs_F[i]); /* Prepare to solve in single precision by creating a single precision copy of the source. Set the trial solution to zero. */ qop_sol_F[i][0] = create_qop_DiracFermion_F(); } /* Solve in single precision */ double dtime = -dclock(); info_F.final_flop = 0.; bicgilu_cl_qop_generic_F( prop_type, &info_F, qop_links_F, &qop_invert_arg, qop_resid_arg_F, dmps, nkappa, qop_sol_F, qop_rhs_F, nsrc); dtime += dclock(); /* Report performance statistics */ /* For now we return the largest value and total iterations */ final_rsq_F = 0; final_relrsq_F = 0; final_restart_F = 0; iters_F = 0; for(isrc = 0; isrc < nsrc; isrc++) for(ikappa = 0; ikappa < nkappa[isrc]; ikappa++){ /* QOP routines return the ratios of the squared norms */ final_rsq_F = MAX(final_rsq_F, qop_resid_arg_F[isrc][ikappa]->final_rsq); final_relrsq_F = MAX(final_relrsq_F, qop_resid_arg_F[isrc][ikappa]->final_rel); final_restart_F = MAX(final_restart_F, qop_resid_arg_F[isrc][ikappa]->final_restart); iters_F += qop_resid_arg_F[isrc][ikappa]->final_iter; if(nsrc > 1 || nkappa[isrc] > 1) node0_printf("BICG(src %d,kappa %d): iters = %d resid = %e relresid = %e\n", isrc, ikappa, qop_resid_arg_F[isrc][ikappa]->final_iter, sqrt(qop_resid_arg_F[isrc][ikappa]->final_rsq), sqrt(qop_resid_arg_F[isrc][ikappa]->final_rel)); } #ifdef CGTIME node0_printf("%s: single precision iters = %d status %d final_rsq %.2e wanted %2e final_rel %.2e wanted %.2e\n", myname, iters_F, info_F.status, final_rsq_F, resid_F * resid_F, final_relrsq_F, rel_F); node0_printf("time = %g flops = %e mflops = %g\n", dtime, info_F.final_flop, info_F.final_flop/(1.0e6*dtime) ); fflush(stdout); #endif /* Add single-precision result to double precision solution (with rescaling) */ update_qop_solution( qop_sol, norm_resid, qop_sol_F, nsrc, subset ); for(i = 0; i < nsrc; i++){ QOP_F3_destroy_D(qop_sol_F[i][0]); /* Convert back */ qdp_rhs_F[i] = QOP_F3_convert_D_to_qdp(qop_rhs_F[i]); } info.final_flop += info_F.final_flop; iters += iters_F; } /* Clean up */ for(i = 0; i < nsrc; i++){ QDP_F3_destroy_D( qdp_rhs_F[i] ); QDP_D3_destroy_D( qdp_resid[i] ); /* Must restore qop_src in case the caller reuses it */ qop_src[i] = QOP_D3_convert_D_from_qdp( qdp_src[i] ); free(qop_sol_F[i]); } QOP_F3_wilson_destroy_L( qop_links_F ); destroy_qop_resid_arg(qop_resid_arg_F, nsrc, nkappa); qop_resid_arg_F = NULL; if(!converged){ node0_printf("%s: NOT Converged after %d iters and %d restarts\n", myname, iters, nrestart); } info.final_sec += dclock(); #ifdef CGTIME node0_printf("CGTIME: time = %e (wilson_qop FD) ", info.final_sec); for(isrc = 0; isrc < nsrc; isrc++) node0_printf("nkappa[%d] = %d tot_iters = %d ", isrc,nkappa[isrc],iters); node0_printf("mflops = %e\n", info.final_flop/(1.0e6*info.final_sec) ); fflush(stdout); #endif return iters; }