static int bicgilu_cl_qop(quark_invert_control *qic, Real clov, MYREAL *kappas[], int nkappa[], wilson_vector *milc_srcs[], wilson_vector **milc_sols[], int nsrc, int *final_restart, Real* final_rsq_ptr, int milc_parity ) { int isrc, ikappa; QOP_FermionLinksWilson *qop_links; QOP_DiracFermion **qop_sol[MAXSRC], *qop_src[MAXSRC]; int iterations_used = 0; QOP_invert_arg_t qop_invert_arg; QOP_resid_arg_t ***qop_resid_arg; double remaptime; int i; site *s; if(nsrc > MAXSRC){ printf("bicgilu_cl_qop: too many sources\n"); terminate(1); } /* Initialize QOP */ if(initialize_qop() != QOP_SUCCESS){ printf("bicbilu_cl_qop: Error initializing QOP\n"); terminate(1); } /* Create QOP links object */ qop_links = create_qop_wilson_fermion_links( clov ); /* Set qop_invert_arg */ set_qop_invert_arg( & qop_invert_arg, qic, milc_parity ); /* Pointers for residual errors */ qop_resid_arg = create_qop_resid_arg( nsrc, nkappa, (qic->resid)*(qic->resid)); remaptime = -dclock(); /* Pointers for solution vectors */ for(isrc = 0; isrc < nsrc; isrc++){ qop_sol[isrc] = (QOP_DiracFermion **)malloc(sizeof(QOP_DiracFermion *)*nkappa[isrc]); if(qop_sol[isrc] == NULL){ printf("bicgilu_cl_qop: Can't allocate qop_sol\n"); terminate(1); } } /* Map MILC source and sink to QOP fields */ for(isrc = 0; isrc < nsrc; isrc++){ gamma5_flip(milc_srcs[isrc], milc_parity); /* compensate for QOP gamma */ qop_src[isrc] = create_D_from_field( milc_srcs[isrc], milc_parity); gamma5_flip(milc_srcs[isrc], milc_parity); /* restore the source */ for(ikappa = 0; ikappa < nkappa[isrc]; ikappa++){ /* Adjust normalization for MILC conventions */ gamma5_flip(milc_sols[isrc][ikappa], milc_parity); /* compensate for QOP gamma */ FORALLSITES(i,s){ scalar_mult_wvec( milc_sols[isrc][ikappa]+i, 2.*kappas[isrc][ikappa], milc_sols[isrc][ikappa]+i); } qop_sol[isrc][ikappa] = create_D_from_field( milc_sols[isrc][ikappa], milc_parity); } }
int bicgilu_cl_qop_single_for_double( int prop_type, QOP_FermionLinksWilson *qop_links, quark_invert_control *qic, int milc_parity, void *dmps[], float *kappas[], int nkappa[], QOP_DiracFermion **qop_sol[], QOP_DiracFermion *qop_src[], int nsrc, int *final_restart, Real *final_rsq_ptr ) { int i, iters, iters_F = 0; int converged; int nrestart; int max_restarts = qic->nrestart; int isrc, ikappa; int final_restart_F; Real final_rsq_F, final_relrsq_F; Real resid_F = 3e-7; /* The limits of a single precision inversion */ Real rel_F = 0; /* The limits of a single precision inversion */ QOP_invert_arg_t qop_invert_arg; QOP_resid_arg_t ***qop_resid_arg_F; QOP_info_t info_F = {0., 0., 0, 0, 0}, info = {0., 0., 0, 0, 0}; QDP_Subset subset = milc2qdp_subset(milc_parity); QOP_F3_FermionLinksWilson *qop_links_F; QOP_F3_DiracFermion **qop_sol_F[MAXSRC], *qop_rhs_F[MAXSRC]; QDP_F3_DiracFermion *qdp_rhs_F[MAXSRC]; QDP_D3_DiracFermion *qdp_src[MAXSRC], *qdp_resid[MAXSRC]; QDP_D3_DiracFermion *qdp_sol; Real relresid2[MAXSRC]; Real resid2[MAXSRC]; QLA_D_Real norm2_src[MAXSRC], norm2_resid[MAXSRC], norm_resid[MAXSRC], scale_resid; char myname[] = "bicgilu_cl_qop_single_for_double"; /* Only one kappa allowed per source for this algorithm */ for(i = 0; i < nsrc; i++){ if(nkappa[i] > 1){ printf("%s: nkappa[%d] = %d != 1\n",myname,i,nkappa[i]); terminate(1); } } /* Set qop_invert_arg */ /* We don't do restarts for the single precision step */ /* We interpret "qic->nrestart" to mean the max number of calls to the single-precision inverter */ set_qop_invert_arg_norestart( & qop_invert_arg, qic, milc_parity ); /* Pointers for residual errors */ /* For now we set the residual to something sensible for single precision */ qop_resid_arg_F = create_qop_resid_arg( nsrc, nkappa, resid_F*resid_F, rel_F*rel_F); /* Create a single precision copy of the links object */ qop_links_F = QOP_FD3_wilson_create_L_from_L( qop_links ); /* Take norm of source and create temporaries */ for(i = 0; i < nsrc; i++){ qdp_src[i] = QOP_D3_convert_D_to_qdp( qop_src[i] ); QDP_D3_r_eq_norm2_D( norm2_src+i, qdp_src[i], subset ); qdp_resid[i] = QDP_D3_create_D(); qdp_rhs_F[i] = QDP_F3_create_D(); qop_sol_F[i] = (QOP_F3_DiracFermion **)malloc(sizeof(QOP_F3_DiracFermion *)); } /* Main loop */ nrestart = 0; converged = 0; iters = 0; info.final_sec = -dclock(); info.final_flop = 0; info.status = QOP_SUCCESS; while(1){ /* Create new residual vectors from the result */ /* r = src - A sol */ compute_qdp_residuals( prop_type, qdp_resid, qdp_src, qop_links, qop_sol, dmps, kappas, nkappa, nsrc, milc_parity ); /* Compute two different norms */ qic->final_rsq = 0; qic->final_relrsq = 0; for(i = 0; i < nsrc; i++){ qdp_sol = QOP_convert_D_to_qdp( qop_sol[i][0] ); relresid2[i] = qdp_relative_residue( qdp_resid[i], qdp_sol, subset ); qop_sol[i][0] = QOP_convert_D_from_qdp( qdp_sol ); qic->final_relrsq = (relresid2[i] > qic->final_relrsq) ? relresid2[i] : qic->final_relrsq; QDP_D3_r_eq_norm2_D( norm2_resid+i, qdp_resid[i], subset ); resid2[i] = norm2_resid[i]/norm2_src[i]; qic->final_rsq = (resid2[i] > qic->final_rsq) ? resid2[i] : qic->final_rsq; #ifdef CG_DEBUG node0_printf("%s: double precision restart %d resid2 = %.2e vs %.2e relresid2 = %.2e vs %.2e\n", myname, nrestart, resid2[i], qic->resid * qic->resid, relresid2[i], qic->relresid * qic->relresid ); #endif } *final_rsq_ptr = qic->final_rsq; /* Use Cartesian norm for now */ *final_restart = nrestart; /* Stop when converged */ converged = 1; for(i = 0; i < nsrc; i++){ if((qic->resid > 0 && resid2[i] > qic->resid * qic->resid) || (qic->relresid > 0 && relresid2[i] > qic->relresid * qic->relresid)){ converged = 0; break; } } if(converged || nrestart++>=max_restarts)break; for(i = 0; i < nsrc; i++){ /* Scale the RHS to avoid underflow */ norm_resid[i] = sqrt(norm2_resid[i]); scale_resid = 1./norm_resid[i]; QDP_D3_D_eq_r_times_D(qdp_resid[i], &scale_resid, qdp_resid[i], subset); /* Scaled residual becomes the new source */ QDP_FD3_D_eq_D( qdp_rhs_F[i], qdp_resid[i], subset); qop_rhs_F[i] = QOP_F3_convert_D_from_qdp( qdp_rhs_F[i]); /* Prepare to solve in single precision by creating a single precision copy of the source. Set the trial solution to zero. */ qop_sol_F[i][0] = create_qop_DiracFermion_F(); } /* Solve in single precision */ double dtime = -dclock(); info_F.final_flop = 0.; bicgilu_cl_qop_generic_F( prop_type, &info_F, qop_links_F, &qop_invert_arg, qop_resid_arg_F, dmps, nkappa, qop_sol_F, qop_rhs_F, nsrc); dtime += dclock(); /* Report performance statistics */ /* For now we return the largest value and total iterations */ final_rsq_F = 0; final_relrsq_F = 0; final_restart_F = 0; iters_F = 0; for(isrc = 0; isrc < nsrc; isrc++) for(ikappa = 0; ikappa < nkappa[isrc]; ikappa++){ /* QOP routines return the ratios of the squared norms */ final_rsq_F = MAX(final_rsq_F, qop_resid_arg_F[isrc][ikappa]->final_rsq); final_relrsq_F = MAX(final_relrsq_F, qop_resid_arg_F[isrc][ikappa]->final_rel); final_restart_F = MAX(final_restart_F, qop_resid_arg_F[isrc][ikappa]->final_restart); iters_F += qop_resid_arg_F[isrc][ikappa]->final_iter; if(nsrc > 1 || nkappa[isrc] > 1) node0_printf("BICG(src %d,kappa %d): iters = %d resid = %e relresid = %e\n", isrc, ikappa, qop_resid_arg_F[isrc][ikappa]->final_iter, sqrt(qop_resid_arg_F[isrc][ikappa]->final_rsq), sqrt(qop_resid_arg_F[isrc][ikappa]->final_rel)); } #ifdef CGTIME node0_printf("%s: single precision iters = %d status %d final_rsq %.2e wanted %2e final_rel %.2e wanted %.2e\n", myname, iters_F, info_F.status, final_rsq_F, resid_F * resid_F, final_relrsq_F, rel_F); node0_printf("time = %g flops = %e mflops = %g\n", dtime, info_F.final_flop, info_F.final_flop/(1.0e6*dtime) ); fflush(stdout); #endif /* Add single-precision result to double precision solution (with rescaling) */ update_qop_solution( qop_sol, norm_resid, qop_sol_F, nsrc, subset ); for(i = 0; i < nsrc; i++){ QOP_F3_destroy_D(qop_sol_F[i][0]); /* Convert back */ qdp_rhs_F[i] = QOP_F3_convert_D_to_qdp(qop_rhs_F[i]); } info.final_flop += info_F.final_flop; iters += iters_F; } /* Clean up */ for(i = 0; i < nsrc; i++){ QDP_F3_destroy_D( qdp_rhs_F[i] ); QDP_D3_destroy_D( qdp_resid[i] ); /* Must restore qop_src in case the caller reuses it */ qop_src[i] = QOP_D3_convert_D_from_qdp( qdp_src[i] ); free(qop_sol_F[i]); } QOP_F3_wilson_destroy_L( qop_links_F ); destroy_qop_resid_arg(qop_resid_arg_F, nsrc, nkappa); qop_resid_arg_F = NULL; if(!converged){ node0_printf("%s: NOT Converged after %d iters and %d restarts\n", myname, iters, nrestart); } info.final_sec += dclock(); #ifdef CGTIME node0_printf("CGTIME: time = %e (wilson_qop FD) ", info.final_sec); for(isrc = 0; isrc < nsrc; isrc++) node0_printf("nkappa[%d] = %d tot_iters = %d ", isrc,nkappa[isrc],iters); node0_printf("mflops = %e\n", info.final_flop/(1.0e6*info.final_sec) ); fflush(stdout); #endif return iters; }