/* ttt <- ttt - msq_x4*src (msq = mass squared) */ FORSOMEPARITY(i,s,l_parity){ #ifdef CONGRAD_TMP_VECTORS if( i < loopend-FETCH_UP ){ prefetch_VVVV( &ttt[i+FETCH_UP], &t_dest[i+FETCH_UP], (su3_vector *)F_PT(s+FETCH_UP,src), &resid[i+FETCH_UP]); } scalar_mult_add_su3_vector( &ttt[i], &t_dest[i], -msq_x4, &ttt[i] ); /* note that we go back to the site structure for src */ add_su3_vector( (su3_vector *)F_PT(s,src), &ttt[i], &resid[i] ); /* remember ttt contains -M_adjoint*M*src */ cg_p[i] = resid[i]; /* note that we go back to the site structure for src */ source_norm += (double)magsq_su3vec( (su3_vector *)F_PT(s,src) ); rsq += (double)magsq_su3vec( &resid[i] ); #else if( i < loopend-FETCH_UP ){ prefetch_VVVV( &((s+FETCH_UP)->ttt), (su3_vector *)F_PT(s+FETCH_UP,dest), (su3_vector *)F_PT(s+FETCH_UP,src), &((s+FETCH_UP)->resid)); } scalar_mult_add_su3_vector( &(s->ttt), (su3_vector *)F_PT(s,dest), -msq_x4, &(s->ttt) ); add_su3_vector( (su3_vector *)F_PT(s,src), &(s->ttt), &(s->resid) ); s->cg_p = s->resid; source_norm += (double) magsq_su3vec( (su3_vector *)F_PT(s,src) ); rsq += (double) magsq_su3vec( &(s->resid) ); #endif } END_LOOP
FORSOMEPARITY(i,s,l_parity) { scalar_mult_add_su3_vector( &temp[i],&init_guess[i],msq_xm4,&temp[i] ); add_su3_vector((su3_vector *)F_PT(s,src1),&temp[i],&common_source[i] ); source_norm += (double)magsq_su3vec( &common_source[i] ); source_norm1 += (double)magsq_su3vec( (su3_vector *)F_PT(s,src1) ); /*pm_strange[i] = cg_p[i] = resid[i] = common_source[i];*/ su3vec_copy( &common_source[i],&(resid[i]) ); su3vec_copy(&(resid[i]),&(cg_p[i]) ); su3vec_copy(&(resid[i]), &pm_strange[i]); su3vec_copy(&init_guess[i],&destvec1[i] ); su3vec_copy(&init_guess[i],&destvec2[i] ); } END_LOOP
static Real relative_residue(su3_vector *p, su3_vector *q, int parity) { double residue, num, den; int i; site *s; residue = 0; FORSOMEPARITY(i,s,parity){ num = (double)magsq_su3vec( &(p[i]) ); den = (double)magsq_su3vec( &(q[i]) ); residue += (den==0) ? 1.0 : (num/den); } END_LOOP
// residues, roots and order define rational function approximation for // x^(nf/8) void grsource_imp_rhmc( field_offset dest, params_ratfunc *rf, int parity, su3_vector **multi_x, su3_vector *sumvec, Real my_rsqmin, int my_niter, int my_prec, ferm_links_t *fn) { register int i,j; register site *s; Real final_rsq; int order = rf->order; Real *residues = rf->res; Real *roots = rf->pole; /*TEMP*/ double sum; sum=0.0; FORSOMEPARITY(i,s,parity){ for(j=0;j<3;j++){ #ifdef SITERAND s->g_rand.c[j].real = gaussian_rand_no(&(s->site_prn)); s->g_rand.c[j].imag = gaussian_rand_no(&(s->site_prn)); #else s->g_rand.c[j].real = gaussian_rand_no(&node_prn); s->g_rand.c[j].imag = gaussian_rand_no(&node_prn); #endif } /*TEMP*/ sum += (double)magsq_su3vec( &(s->g_rand) ); } /*TEMP*/g_doublesum( &sum); node0_printf("GRSOURCE: sum = %.10e\n",sum); ks_ratinv( F_OFFSET(g_rand), multi_x, roots, order, my_niter, my_rsqmin, my_prec, parity, &final_rsq, fn ); ks_rateval( sumvec, F_OFFSET(g_rand), multi_x, residues, order, parity ); FORSOMEPARITY(i,s,parity){ *(su3_vector *)F_PT(s,dest) = sumvec[i]; }
/* Returns the 2-norm of a fermion vector */ static void norm2(su3_vector *vec, double *norm, int parity){ register double n ; register site *s; register int i; n=0 ; FORSOMEPARITY(i,s,parity){ n += magsq_su3vec(&(vec[i])); }
/*
 * Fermilab-style relative residue of the field p measured against q.
 *
 * For every site index i selected by `parity`, the ratio
 * |p[i]|^2 / |q[i]|^2 is accumulated (both magnitudes come from
 * magsq_su3vec).  A site whose |q[i]|^2 is exactly zero contributes 1.0
 * instead of the ratio, which avoids a division by zero.  The per-node
 * total is then passed through g_doublesum().
 *
 * Returns sqrt(total/volume) when parity is EVENANDODD and
 * sqrt(2*total/volume) for any other parity.
 * NOTE(review): the factor of 2 presumably compensates for a single
 * parity covering half of `volume` -- confirm.
 */
static Real my_relative_residue(su3_vector *p, su3_vector *q, int parity){
  register int i;
  double numer, denom;
  double residue = (double)0.0;

  /* numer/denom are per-thread scratch; residue is the OMP reduction target */
  FORSOMEFIELDPARITY_OMP(i, parity, private(numer,denom) reduction(+:residue)){
    numer = (double)magsq_su3vec(&p[i]);
    denom = (double)magsq_su3vec(&q[i]);
    if(denom == 0){
      residue += 1.0;
    } else {
      residue += numer/denom;
    }
  } END_LOOP_OMP

  g_doublesum(&residue);

  if(parity != EVENANDODD){
    return sqrt(2*residue/volume);
  }
  return sqrt(residue/volume);
}
/*
 * magsq_wvec: squared magnitude of a wilson_vector, i.e. the sum of
 * real*real + imag*imag over the 4 x 3 complex entries vec->d[0..3].c[0..2].
 *
 * Two alternative bodies are provided and chosen by a preprocessor
 * conditional whose opening directive is above this excerpt
 * (NOTE(review): the controlling macro is not visible here -- confirm).
 * Both alternatives share the single closing brace that follows the
 * final #endif.
 *
 * First alternative: add up magsq_su3vec() of each of the four d[i].
 */
Real magsq_wvec( wilson_vector *vec ){
  register int i;
  register Real sum;
  sum=0.0;
  for(i=0;i<4;i++)sum += magsq_su3vec( &(vec->d[i]) );
  return(sum);
#else
/* Fast version: the twelve complex components are accumulated by hand,
   in the order d[0].c[0], d[0].c[1], ..., d[3].c[2]. */
Real magsq_wvec( wilson_vector *vec ){
  /* With NATIVEDOUBLE the temporaries and accumulator are double;
     otherwise they are Real.  The result is cast to Real on return. */
#ifdef NATIVEDOUBLE
  register double ar,ai,sum;
#else
  register Real ar,ai,sum;
#endif
  /* d[0] */
  ar=vec->d[0].c[0].real; ai=vec->d[0].c[0].imag;
  sum = ar*ar + ai*ai;
  ar=vec->d[0].c[1].real; ai=vec->d[0].c[1].imag;
  sum += ar*ar + ai*ai;
  ar=vec->d[0].c[2].real; ai=vec->d[0].c[2].imag;
  sum += ar*ar + ai*ai;
  /* d[1] */
  ar=vec->d[1].c[0].real; ai=vec->d[1].c[0].imag;
  sum += ar*ar + ai*ai;
  ar=vec->d[1].c[1].real; ai=vec->d[1].c[1].imag;
  sum += ar*ar + ai*ai;
  ar=vec->d[1].c[2].real; ai=vec->d[1].c[2].imag;
  sum += ar*ar + ai*ai;
  /* d[2] */
  ar=vec->d[2].c[0].real; ai=vec->d[2].c[0].imag;
  sum += ar*ar + ai*ai;
  ar=vec->d[2].c[1].real; ai=vec->d[2].c[1].imag;
  sum += ar*ar + ai*ai;
  ar=vec->d[2].c[2].real; ai=vec->d[2].c[2].imag;
  sum += ar*ar + ai*ai;
  /* d[3] */
  ar=vec->d[3].c[0].real; ai=vec->d[3].c[0].imag;
  sum += ar*ar + ai*ai;
  ar=vec->d[3].c[1].real; ai=vec->d[3].c[1].imag;
  sum += ar*ar + ai*ai;
  ar=vec->d[3].c[2].real; ai=vec->d[3].c[2].imag;
  sum += ar*ar + ai*ai;
  return((Real)sum);
#endif
}
static void ks_multicg_reverse_field( /* Return value is number of iterations taken */ su3_vector *src, /* source vector (type su3_vector) */ su3_vector **psim, /* solution vectors */ ks_param *ksp, /* KS parametes, including the offsets */ int num_offsets, /* number of offsets */ quark_invert_control *qic, imp_ferm_links_t *fn /* Storage for fermion links */ ) { char myname[] = "ks_multicg_reverse_field"; /* Site su3_vector's resid, cg_p and ttt are used as temporaies */ register int i; register site *s; int iteration; /* counter for iterations */ int num_offsets_now; /* number of offsets still being worked on */ double c1, c2, rsq, oldrsq, pkp; /* pkp = cg_p.K.cg_p */ double source_norm; /* squared magnitude of source vector */ double rsqstop; /* stopping residual normalized by source norm */ int l_parity=0; /* parity we are currently doing */ int l_otherparity=0; /* the other parity */ #ifdef FN msg_tag *tags1[16], *tags2[16]; /* tags for gathers to parity and opposite */ #endif int special_started; /* 1 if dslash_special has been called */ int j, j_low; Real *shifts, mass_low, msq_xm4; double *zeta_i, *zeta_im1, *zeta_ip1; double *beta_i, *beta_im1, *alpha; // su3_vector **pm; /* vectors not involved in gathers */ // Switch indices su3_vector **psim_rev; su3_vector *psim_space; su3_vector **pm_rev; su3_vector *pm_space; /* Unpack structure */ /* We don't restart this algorithm, so we adopt the convention of taking the product here */ int niter = qic->max*qic->nrestart; Real rsqmin = qic->resid * qic->resid; /* desired squared residual - normalized as sqrt(r*r)/sqrt(src_e*src_e) */ int parity = qic->parity; /* EVEN, ODD */ /* Timing */ #ifdef CGTIME double dtimec; #endif double nflop; qic->final_iters = 0; qic->final_restart = 0; //#if FERM_ACTION == HISQ // fn->hl.current_X_set = 0; // restore_fn_links(fn); //#endif if( num_offsets==0 )return; if(fn == NULL){ printf("%s(%d): Called with NULL fn\n", myname, this_node); terminate(1); } // Switch indices psim_rev 
= (su3_vector **)malloc( sizeof(su3_vector *)*sites_on_node ); psim_space = (su3_vector *)malloc( sizeof(su3_vector)*sites_on_node*num_offsets ); pm_rev = (su3_vector **)malloc( sizeof(su3_vector *)*sites_on_node ); pm_space = (su3_vector *)malloc( sizeof(su3_vector)*sites_on_node*num_offsets ); if( psim_space == NULL || pm_space == NULL){printf("%s: NO ROOM!\n",myname); exit(0); } for( i=0; i<sites_on_node; i++ ){ psim_rev[i] = &(psim_space[num_offsets*i]); pm_rev[i] = &(pm_space[num_offsets*i]); for( j=0; j<num_offsets; j++){ psim_rev[i][j] = psim[j][i]; } } /* debug */ #ifdef CGTIME dtimec = -dclock(); #endif nflop = 1205 + 15*num_offsets; if(parity==EVENANDODD)nflop *=2; special_started = 0; /* if we want both parities, we will do even first. */ switch(parity){ case(EVEN): l_parity=EVEN; l_otherparity=ODD; break; case(ODD): l_parity=ODD; l_otherparity=EVEN; break; case(EVENANDODD): l_parity=EVEN; l_otherparity=ODD; break; } shifts = (Real *)malloc(num_offsets*sizeof(Real)); zeta_i = (double *)malloc(num_offsets*sizeof(double)); zeta_im1 = (double *)malloc(num_offsets*sizeof(double)); zeta_ip1 = (double *)malloc(num_offsets*sizeof(double)); beta_i = (double *)malloc(num_offsets*sizeof(double)); beta_im1 = (double *)malloc(num_offsets*sizeof(double)); alpha = (double *)malloc(num_offsets*sizeof(double)); //pm = (su3_vector **)malloc(num_offsets*sizeof(su3_vector *)); mass_low = 1.0e+20; j_low = -1; for(j=0;j<num_offsets;j++){ shifts[j] = ksp[j].offset; if (ksp[j].offset < mass_low){ mass_low = ksp[j].offset; j_low = j; } } for(j=0;j<num_offsets;j++) if(j!=j_low){ //pm[j] = (su3_vector *)malloc(sites_on_node*sizeof(su3_vector)); shifts[j] -= shifts[j_low]; } msq_xm4 = -shifts[j_low]; iteration = 0; #define PAD 0 /* now we can allocate temporary variables and copy then */ /* PAD may be used to avoid cache thrashing */ if(first_multicongrad) { ttt = (su3_vector *) malloc((sites_on_node+PAD)*sizeof(su3_vector)); cg_p = (su3_vector *) 
malloc((sites_on_node+PAD)*sizeof(su3_vector)); resid = (su3_vector *) malloc((sites_on_node+PAD)*sizeof(su3_vector)); first_multicongrad = 0; } #ifdef CGTIME dtimec = -dclock(); #endif /* initialization process */ start: #ifdef FN if(special_started==1) { /* clean up gathers */ cleanup_gathers(tags1, tags2); special_started = 0; } #endif num_offsets_now = num_offsets; source_norm = 0.0; FORSOMEPARITY(i,s,l_parity){ source_norm += (double) magsq_su3vec( src+i ); su3vec_copy( src+i, &(resid[i])); su3vec_copy(&(resid[i]), &(cg_p[i])); clearvec(&(psim_rev[i][j_low])); for(j=0;j<num_offsets;j++) if(j!=j_low){ clearvec(&(psim_rev[i][j])); su3vec_copy(&(resid[i]), &(pm_rev[i][j])); } } END_LOOP;
/* Hadron wave functions. */ void wavefunc_t() { register int i,j,n; register site *s; register complex cc; msg_tag *tag; Real finalrsq,scale,x; int tmin,tmax,cgn,color; /* for baryon code */ int ca,ca1,ca2,cb,cb1,cb2; void symmetry_combine(field_offset src,field_offset space,int size,int dir); void block_fourier( field_offset src, /* src is field to be transformed */ field_offset space, /* space is working space, same size as src */ int size, /* Size of field in bytes. The field must consist of size/sizeof(complex) consecutive complex numbers. For example, an su3_vector is 3 complex numbers. */ int isign); /* 1 for x -> k, -1 for k -> x */ void fourier( field_offset src, /* src is field to be transformed */ field_offset space, /* space is working space, same size as src */ int size, /* Size of field in bytes. The field must consist of size/sizeof(complex) consecutive complex numbers. For example, an su3_vector is 3 complex numbers. */ int isign); /* 1 for x -> k, -1 for k -> x */ void write_wf(field_offset src,char *string,int tmin,int tmax); /* Fix TUP Coulomb gauge - gauge links only*/ rephase( OFF ); gaugefix(TUP,(Real)1.8,500,(Real)GAUGE_FIX_TOL); rephase( ON ); for(color=0;color<3;color++){ /* Make wall source */ FORALLSITES(i,s){ for(j=0;j<3;j++)s->phi.c[j]=cmplx(0.0,0.0); if( s->x%2==0 && s->y%2==0 && s->z%2==0 && s->t==0 ){ s->phi.c[color] = cmplx(-1.0,0.0); } } /* do a C.G. 
(source in phi, result in xxx) */ load_ferm_links(&fn_links); cgn = ks_congrad(F_OFFSET(phi),F_OFFSET(xxx),mass, niter, rsqprop, PRECISION, EVEN, &finalrsq, &fn_links); /* Multiply by -Madjoint, result in propmat[color] */ dslash_site( F_OFFSET(xxx), F_OFFSET(propmat[color]), ODD, &fn_links); scalar_mult_latvec( F_OFFSET(xxx), (Real)(-2.0*mass), F_OFFSET(propmat[color]), EVEN); } /* construct the diquark propagator--uses tempmat1 and do this before you fft the quark propagator */ FORALLSITES(i,s){ for(ca=0;ca<3;ca++)for(cb=0;cb<3;cb++){ ca1= (ca+1)%3; ca2= (ca+2)%3; cb1= (cb+1)%3; cb2= (cb+2)%3; CMUL((s->propmat[ca1].c[cb1]),(s->propmat[ca2].c[cb2]), (s->tempmat1.e[ca][cb])); CMUL((s->propmat[ca1].c[cb2]),(s->propmat[ca2].c[cb1]), cc); CSUB((s->tempmat1.e[ca][cb]),cc,(s->tempmat1.e[ca][cb])); } } /* complex conjugate the diquark prop */ FORALLSITES(i,s){ for(ca=0;ca<3;ca++)for(cb=0;cb<3;cb++){ CONJG((s->tempmat1.e[ca][cb]),(s->tempmat1.e[ca][cb])); } } /* Transform the diquark propagator. */ block_fourier( F_OFFSET(tempmat1), F_OFFSET(tempvec[0]), 3*sizeof(su3_vector), FORWARDS); /* complex conjugate the diquark prop. Now we have D(-k) for convolution */ FORALLSITES(i,s){ for(ca=0;ca<3;ca++)for(cb=0;cb<3;cb++){ CONJG((s->tempmat1.e[ca][cb]),(s->tempmat1.e[ca][cb])); } } /* Transform the propagator. */ block_fourier( F_OFFSET(propmat[0]), F_OFFSET(tempvec[0]), 3*sizeof(su3_vector), FORWARDS); /* CODE SPECIFIC TO PARTICULAR PARTICLES */ /* MESON CODE */ /* Square the result, component by component, sum over source and sink colors, result in ttt.c[0] */ FORALLSITES(i,s){ s->ttt.c[0].real = s->ttt.c[0].imag = 0.0; for(color=0;color<3;color++){ s->ttt.c[0].real += magsq_su3vec( &(s->propmat[color]) ); } }
int ks_congrad_parity_cpu( su3_vector *t_src, su3_vector *t_dest, quark_invert_control *qic, Real mass, imp_ferm_links_t *fn){ register int i; register site *s; int iteration; /* counter for iterations */ Real a,b; /* Sugar's a,b */ #ifdef FEWSUMS double actual_rsq = 999.; /* rsq from actual summation of resid */ double c_tr,c_tt,tempsum[4]; /* Re<resid|ttt>, <ttt|ttt> */ #endif double rsq = 0,relrsq = 1.; /* resid**2, rel resid*2 */ double oldrsq,pkp; /*last resid*2,pkp = cg_p.K.cg_p */ Real msq_x4; /* 4*mass*mass */ double source_norm; /* squared magnitude of source vector */ int otherparity = 0; /* the other parity */ msg_tag * tags1[16], *tags2[16]; /* tags for gathers to parity and opposite */ int special_started = 0; /* 1 if dslash_fn_field_special has been called */ int nrestart; /* Restart counter */ su3_vector *ttt, *cg_p, *resid; #ifdef CGTIME double nflop = 1187; #endif double dtimec; char myname[] = "ks_congrad_parity_cpu"; /* Unpack structure */ int niter = qic->max; /* maximum number of iters per restart */ int max_restarts = qic->nrestart; /* maximum restarts */ Real rsqmin = qic->resid * qic->resid; /* desired residual - normalized as sqrt(r*r)/sqrt(src_e*src_e) */ Real relrsqmin = qic->relresid * qic->relresid; /* desired relative residual (FNAL)*/ int parity = qic->parity; /* EVEN, ODD */ int max_cg = max_restarts*niter; /* Maximum number of iterations */ if(fn == NULL){ printf("%s(%d): Called with NULL fn\n", myname, this_node); terminate(1); } dtimec = -dclock(); msq_x4 = 4.0*mass*mass; switch(parity){ case(EVEN): otherparity=ODD; break; case(ODD): otherparity=EVEN; break; } /* Source norm */ source_norm = 0.0; FORSOMEFIELDPARITY_OMP(i,parity,reduction(+:source_norm)){ source_norm += (double)magsq_su3vec( &t_src[i] ); } END_LOOP_OMP g_doublesum( &source_norm ); #ifdef CG_DEBUG node0_printf("congrad: source_norm = %e\n", (double)source_norm); #endif /* Start CG iterations */ nrestart = 0; iteration = 0; qic->size_r = 0; qic->size_relr = 1.; 
qic->final_iters = 0; qic->final_restart = 0; qic->converged = 1; qic->final_rsq = 0.; qic->final_relrsq = 0.; /* Provision for trivial solution */ if(source_norm == 0.0){ /* Zero the solution, free space, and return zero iterations */ FORSOMEFIELDPARITY_OMP(i,parity,default(shared)){ memset(t_dest + i, 0, sizeof(su3_vector)); } END_LOOP_OMP dtimec += dclock(); #ifdef CGTIME if(this_node==0){ printf("CONGRAD5: time = %e (fn %s) masses = 1 iters = %d mflops = %e\n", dtimec, prec_label[PRECISION-1], qic->final_iters, ((double)nflop*volume*qic->final_iters)/(1.0e6*dtimec*numnodes()) ); fflush(stdout);} #endif return 0; }
/* Assume the first Nvecs_curr eigenvectors have been already orthonormalized. If norm of an eigenvector is less than ORTHO_EPS, remove it. Rturn the number of new eigenvectors to be added. */ static int orthogonalize(int Nvecs, int Nvecs_curr, su3_vector **eigVec, int parity){ register int i; int j, k, Nvecs_add, n; double norm; double_complex cc; double_complex *c; j = Nvecs_curr; Nvecs_add = Nvecs; n = Nvecs_curr + Nvecs_add; c = (double_complex *)malloc(n*sizeof(double_complex)); while(j < n){ /* Modified Gram-Schmidt Orthogonality is better but more communications are needed */ for(k = 0; k < j; k++){ // c[k] = dcmplx((double)0.0,(double)0.0); // FORSOMEFIELDPARITY_OMP(i, parity, private(cc) reduction(+:c[k])){ // cc = su3_dot(eigVec[k]+i, eigVec[j]+i); // CSUM(c[k], cc); // } END_LOOP_OMP; double cctotr=0., cctoti=0.; FORSOMEFIELDPARITY_OMP(i, parity, private(cc) reduction(+:cctotr,cctoti)){ cc = su3_dot(eigVec[k]+i, eigVec[j]+i); cctotr += cc.real; cctoti += cc.imag; } END_LOOP_OMP; c[k].real = cctotr; c[k].imag = cctoti; g_dcomplexsum(c+k); FORSOMEFIELDPARITY_OMP(i, parity, default(shared)){ c_scalar_mult_sub_su3vec(eigVec[j]+i, c+k, eigVec[k]+i); } END_LOOP_OMP } /* Gram-Schmidt Less communications but poor orthogonality might happen if the number of vectors is too large. 
*/ /* for(k = 0; k < j; k++){ c[k] = dcmplx((double)0.0,(double)0.0); FORSOMEFIELDPARITY_OMP(i, parity, private(cc) reduction(+:c[k])){ cc = su3_dot(eigVec[k]+i, eigVec[j]+i); CSUM(c[k], cc); } END_LOOP_OMP } g_vecdcomplexsum(c, j); for(k = 0; k < j; k++){ FORSOMEFIELDPARITY_OMP(i, parity, default(shared)){ c_scalar_mult_sub_su3vec(eigVec[j]+i, c+k, eigVec[k]+i); } END_LOOP_OMP } */ norm = (double)0.0; FORSOMEFIELDPARITY_OMP(i, parity, reduction(+:norm)){ norm += magsq_su3vec(eigVec[j]+i); } END_LOOP_OMP g_doublesum(&norm); norm = sqrt(norm); if( norm < ORTHO_EPS ){ Nvecs_add--; n--; for(k = j; k < n; k++){ FORSOMEFIELDPARITY_OMP(i, parity, default(shared)){ eigVec[k][i] = eigVec[k+1][i]; } END_LOOP_OMP } } else{