// f += coeff * U_dir(x) * v(x)^dagger // Note that STAG order stores hermitian conjugate of links. void Fasqtad::force_product_sum(const Matrix *v, int dir, IFloat coeff, Matrix *f){ unsigned long v2=(unsigned long)v; if( qalloc_is_fast((Matrix *)v) && qalloc_is_fast((Matrix *)f) && qalloc_is_fast((Matrix *)(GaugeField()+dir))) v2 = v2 - 0xb0000000 + 0x9c000000; int vol = node_sites[0]*node_sites[1]*node_sites[2]*node_sites[3]; ForceFlops += vol * 234; fgdagm1dagpm2(f, &coeff, (GaugeField()+dir), (const Matrix*)v2, f, &vol); }
//------------------------------------------------------------------------------ // GforceSite(Matrix& force, int *x, int mu): // It calculates the gauge force at site x and direction mu. //------------------------------------------------------------------------------ void GimprRect::GforceSite(Matrix& force, int *x, int mu) { const char *fname = "GforceSite(M&,i*,i)"; setCbufCntrlReg(4, CBUF_MODE4); Matrix *u_off = GaugeField()+GsiteOffset(x)+mu; Matrix mt1; //---------------------------------------------------------------------------- // get staple // mt1 = staple //---------------------------------------------------------------------------- Staple(mt1, x, mu); ForceFlops += 198*3*3+12+216*3; //---------------------------------------------------------------------------- // mt2 = U_mu(x) //---------------------------------------------------------------------------- Matrix mt2(*u_off); // moveMem((IFloat *)mp2, (IFloat *)u_off, MATRIX_SIZE * sizeof(IFloat)) ; //---------------------------------------------------------------------------- // force = -(beta*(1-8*c_1)/3)*U_mu(x)*stap //---------------------------------------------------------------------------- force.DotMEqual(mt2, mt1); force *= plaq_coeff; // mDotMEqual((IFloat *)&force, (const IFloat *)mp2, (const IFloat *)mp1); // tmp = plaq_coeff ; // vecTimesEquFloat((IFloat *)&force, tmp, MATRIX_SIZE); //---------------------------------------------------------------------------- // get rectangle staple // mt1 = rect_stap //---------------------------------------------------------------------------- RectStaple(mt1, x, mu); ForceFlops += 198*3*18+216*3*6; //---------------------------------------------------------------------------- // mt2 = -(beta*c_1/3)*U_mu(x) //---------------------------------------------------------------------------- // mt2 = *u_off; // moveMem((IFloat *)mp2, (IFloat *)u_off, MATRIX_SIZE*sizeof(IFloat)); mt2 *= rect_coeff; // tmp = rect_coeff; // vecTimesEquFloat((IFloat *)mp2, tmp, 
MATRIX_SIZE) ; ForceFlops +=234; //---------------------------------------------------------------------------- // force += -(beta*c_1/3)*U_mu(x)*rect_stap //---------------------------------------------------------------------------- force.DotMPlus(mt2, mt1); // mDotMPlus((IFloat *)&force, (const IFloat *)mp2, (const IFloat *)mp1); mt1.Dagger(force); force.TrLessAntiHermMatrix(mt1); ForceFlops +=198+24; }
// Molecular-dynamics force of the two-flavour ratio action.
//
// Steps, in order:
//   1. eta = DdagD1_inv( D2^dagger phi )
//   2. raw force = D1 force(eta, D1 eta) - D2 force(eta, phi)
//   3. optional smearing correction via smart_conf_ when smeared_ is set
//   4. traceless anti-hermitian projection, which is the returned field
// With ANTIPERIODIC_BC defined, BC->apply_bc(*u_) is called once before the
// solve and once more after the force terms. Each stage is timed and reported
// through _Message.
GaugeField Action_Nf2_ratio::md_force(){
#ifdef ANTIPERIODIC_BC
  BC->apply_bc(*u_);
#endif

  Field eta = DdagD1_inv(D2_->mult_dag(phi_));

  long double elapsed;

  // --- force terms -------------------------------------------------------
  FINE_TIMING_START(elapsed);
  GaugeField raw_force(D1_->md_force(eta,D1_->mult(eta)));
  raw_force -= GaugeField(D2_->md_force(eta,phi_));
  FINE_TIMING_END(elapsed);
  _Message(TIMING_VERB_LEVEL, "[Timing] - Action_Nf2_ratio::md_force"
           << " - Force terms timing = "
           << elapsed << std::endl);

#ifdef ANTIPERIODIC_BC
  BC->apply_bc(*u_);
#endif

  // --- smearing correction (only when enabled) ---------------------------
  FINE_TIMING_START(elapsed);
  if (smeared_) {
    smart_conf_->smeared_force(raw_force);
  }
  FINE_TIMING_END(elapsed);
  _Message(TIMING_VERB_LEVEL, "[Timing] - Action_Nf2_ratio::md_force"
           << " - Smeared force timing = "
           << elapsed << std::endl);

  // --- projection --------------------------------------------------------
  FINE_TIMING_START(elapsed);
  GaugeField force = FieldUtils::TracelessAntihermite(raw_force);
  FINE_TIMING_END(elapsed);
  _Message(TIMING_VERB_LEVEL, "[Timing] - Action_Nf2_ratio::md_force"
           << " - TracelessAntihermite timing = "
           << elapsed << std::endl);

  _MonitorMsg(ACTION_VERB_LEVEL, Action,force, name_);
  return force;
}
//------------------------------------------------------------------ // "Odd" fermion force evolution routine written by Chris Dawson, taken // verbatim, so performance will suck on qcdoc. //------------------------------------------------------------------ ForceArg FdwfBase::EvolveMomFforce( Matrix* mom, // momenta Vector* phi, // odd pseudo-fermion field Vector* eta, // very odd pseudo-fermion field Float mass, Float dt ) { char *fname = "EvolveMomFforce(M*,V*,V*,F,F)"; VRB.Func(cname,fname); Matrix *gauge = GaugeField() ; if (Colors() != 3) { ERR.General(cname,fname,"Wrong nbr of colors.") ; } if (SpinComponents()!=4) { ERR.General(cname,fname,"Wrong nbr of spin comp.") ;} if (mom == 0) { ERR.Pointer(cname,fname,"mom") ; } if (phi == 0) { ERR.Pointer(cname,fname,"phi") ; } // allocate space for two CANONICAL fermion fields // these are all full fermion vector sizes ( i.e. *not* preconditioned ) const int f_size ( FsiteSize() * GJP.VolNodeSites() ); const int f_size_cb ( f_size/2 ) ; // f_size must be multiple of 2 const int f_site_size_4d( 2 * Colors() * SpinComponents() ); const int f_size_4d ( f_site_size_4d * GJP.VolNodeSites()) ; char *str_v1 = "v1" ; Vector *v1 = (Vector *)smalloc(f_size*sizeof(Float)) ; if (v1 == 0) ERR.Pointer(cname, fname, str_v1) ; VRB.Smalloc(cname, fname, str_v1, v1, f_size*sizeof(Float)) ; char *str_v2 = "v2" ; Vector *v2 = (Vector *)smalloc(f_size*sizeof(Float)) ; if (v2 == 0) ERR.Pointer(cname, fname, str_v2) ; VRB.Smalloc(cname, fname, str_v2, v2, f_size*sizeof(Float)) ; Float L1 = 0.0; Float L2 = 0.0; Float Linf = 0.0; #ifdef PROFILE Float time = -dclock(); ForceFlops=0; #endif //Calculate v1, v2. Both must be in CANONICAL order afterwards { CgArg cg_arg ; cg_arg.mass = mass ; DiracOpDwf dwf(*this, v1, v2, &cg_arg, CNV_FRM_YES) ; Float kappa( 1.0 / ( 2 * (4 + GJP.DwfA5Inv() - GJP.DwfHeight()))); v2->CopyVec(phi,f_size_cb); // rescale the input field. 
As the second half of the this field // will be constructed by acting with the PC dslash on v1, this // rescales *one* of the full vectors - giving rise to an overall // rescaling of the final answer by exactly -\kappa^2 v2->VecTimesEquFloat(-kappa*kappa,f_size_cb); // only need one factor of -\kappa^2, so don't rescale the second // full vector (v2) v1->CopyVec(eta,f_size_cb); dwf.Dslash(v2+(f_size_cb/6), v2 , CHKB_ODD, DAG_YES); dwf.Dslash(v1+(f_size_cb/6), v1 , CHKB_ODD, DAG_NO); // v1 and v2 are now the vectors needed to contruct the force term // written in ( ODD, EVEN ) ordering. They will be converted back // into canonical ordering when the destructor is called. } // two fermion vectors at a single position // - these will be used to store off-node // field components char *str_site_v1 = "site_v1" ; Float *site_v1 = (Float *)smalloc(FsiteSize()*sizeof(Float)) ; if (site_v1 == 0) ERR.Pointer(cname, fname, str_site_v1) ; VRB.Smalloc(cname, fname, str_site_v1, site_v1, FsiteSize()*sizeof(Float)) ; char *str_site_v2 = "site_v2" ; Float *site_v2 = (Float *)smalloc(FsiteSize()*sizeof(Float)) ; if (site_v2 == 0) ERR.Pointer(cname, fname, str_site_v2) ; VRB.Smalloc(cname, fname, str_site_v2, site_v2, FsiteSize()*sizeof(Float)) ; // evolve the momenta by the fermion force int mu, x, y, z, t, s; const int lx(GJP.XnodeSites()); const int ly(GJP.YnodeSites()); const int lz(GJP.ZnodeSites()); const int lt(GJP.TnodeSites()); const int ls(GJP.SnodeSites()); // start by summing first over direction (mu) and then over site to // allow SCU transfers to happen face-by-face in the outermost loop. 
VRB.Clock(cname, fname, "Before loop over links.\n") ; for (mu=0; mu<4; mu++) { for (t=0; t<lt; t++){ for (z=0; z<lz; z++){ for (y=0; y<ly; y++){ for (x=0; x<lx; x++) { // position offset int gauge_offset = x+lx*(y+ly*(z+lz*t)); // offset for vector field at this point // (4d only, no fifth dimension) int vec_offset = f_site_size_4d*gauge_offset ; // offset for link in mu direction from this point gauge_offset = mu+4*gauge_offset ; Float *v1_plus_mu=NULL ; Float *v2_plus_mu=NULL ; int vec_plus_mu_stride=0 ; int vec_plus_mu_offset = f_site_size_4d ; // sign of coeff (look at momenta update) Float coeff = -2.0 * dt ; switch (mu) { case 0 : // next position in mu direction vec_plus_mu_offset *= (x+1)%lx+lx*(y+ly*(z+lz*t)) ; // vec_plus_mu_offset now the correct // offset for a fermion field at this point // in the lattice if ((x+1) == lx) { // off-node for (s=0; s<ls; s++) { // fill site_v1 and site_v2 with v1 and v2 data // from x=0 on next node, need loop because // data is not contiguous in memory getPlusData( (Float *)site_v1+s*f_site_size_4d, (Float *)v1+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; getPlusData( (Float*)site_v2+s*f_site_size_4d, (Float*)v2+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; } // end for s v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; vec_plus_mu_stride = 0 ; // field now contiguous // GJP.XnodeBc() gives the forward boundary // condition only (so this should work). 
if (GJP.XnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { // on - node // // just add offset to v1 and v2 // (they are now 1 forward in the mu direction ) // v1_plus_mu = (Float*)v1+vec_plus_mu_offset ; v2_plus_mu = (Float*)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; // explained below } break ; // Repeat for the other directions case 1 : vec_plus_mu_offset *= x+lx*((y+1)%ly+ly*(z+lz*t)) ; if ((y+1) == ly) { for (s=0; s<ls; s++) { getPlusData( (Float*)site_v1+s*f_site_size_4d, (Float*)v1+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; getPlusData( (Float*)site_v2+s*f_site_size_4d, (Float*)v2+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; } v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; vec_plus_mu_stride = 0 ; if (GJP.YnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; } break ; case 2 : vec_plus_mu_offset *= x+lx*(y+ly*((z+1)%lz+lz*t)) ; if ((z+1) == lz) { for (s=0; s<ls; s++) { getPlusData( (Float*)site_v1+s*f_site_size_4d, (Float*)v1+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; getPlusData( (Float*)site_v2+s*f_site_size_4d, (Float*)v2+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; } v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; vec_plus_mu_stride = 0 ; if (GJP.ZnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; } break ; case 3 : vec_plus_mu_offset *= x+lx*(y+ly*(z+lz*((t+1)%lt))) ; if ((t+1) == lt) { for (s=0; s<ls; s++) { getPlusData( (Float*)site_v1+s*f_site_size_4d, (Float*)v1+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; getPlusData( (Float*)site_v2+s*f_site_size_4d, (Float*)v2+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; } v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; vec_plus_mu_stride = 0 ; if 
(GJP.TnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; } } // end (the evil) mu switch Matrix tmp_mat1, tmp_mat2; // the non-zero stride pattern is due to domain wall // fermions ( summing up *ls* different sproj's ) // // f_size_4d-f_site_size_4d is the number of floats // between the end of one spinor at s and the start of the // spinor at s+1 // // vec_plus_mu_stride is the same, except when // this is off boundary, in that case the info // is copied into a contiguous block in the above code // and vec_plus_mu_stride set to zero // ( 1 - gamma_\mu ) Tr_s [ v1(x+\mu) v2^{\dagger}(x) ] sproj_tr[mu]( (Float *)&tmp_mat1, (Float *)v1_plus_mu, (Float *)v2+vec_offset, ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ; // (1 + gamma_\mu) Tr_s [ v2(x+\mu) v1^{\dagger}(x) ] sproj_tr[mu+4]( (Float *)&tmp_mat2, (Float *)v2_plus_mu, (Float *)v1+vec_offset, ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ; // exactly what is sounds like tmp_mat1 += tmp_mat2 ; if(GJP.Snodes() != 1) { for (s=0; s<(sizeof(Matrix)/sizeof(Float)); ++s) { glb_sum_dir((Float *)&tmp_mat1 + s, 4) ; } } // multiply sum by the link in the \mu direction tmp_mat2.DotMEqual(*(gauge+gauge_offset), tmp_mat1) ; // take tracless antihermitian piece // TrLessAntiHermMatrix need to be passed // the dagger of the matrix in question tmp_mat1.Dagger(tmp_mat2) ; tmp_mat2.TrLessAntiHermMatrix(tmp_mat1) ; tmp_mat2 *= coeff ; // note the minus sign. *(mom+gauge_offset) -= tmp_mat2 ; Float norm = tmp_mat2.norm(); Float tmp = sqrt(norm); L1 += tmp; L2 += norm; Linf = (tmp>Linf ? 
tmp : Linf); } // end for x } // end for y } // end for z } // end for t } // end for mu ForceFlops += (2*9*16*ls + 18+ 198+36+24)*lx*ly*lz*lt*4; #ifdef PROFILE time += dclock(); print_flops(cname,fname,ForceFlops,time); #endif // deallocate smalloc'd space VRB.Sfree(cname, fname, str_site_v2, site_v2) ; sfree(site_v2) ; VRB.Sfree(cname, fname, str_site_v1, site_v1) ; sfree(site_v1) ; VRB.Sfree(cname, fname, str_v2, v2) ; sfree(v2) ; VRB.Sfree(cname, fname, str_v1, v1) ; sfree(v1) ; glb_sum(&L1); glb_sum(&L2); glb_max(&Linf); L1 /= 4.0*GJP.VolSites(); L2 /= 4.0*GJP.VolSites(); VRB.FuncEnd(cname,fname); return ForceArg(L1, sqrt(L2), Linf); }
CPS_START_NAMESPACE /*!\file \brief Implementation of FdwfBase class. $Id: f_dwf_base_force.C,v 1.14 2012-08-31 04:55:08 chulwoo Exp $ */ //-------------------------------------------------------------------- // CVS keywords // // $Source: /home/chulwoo/CPS/repo/CVS/cps_only/cps_pp/src/util/lattice/f_dwf_base/noarch/f_dwf_base_force.C,v $ // $State: Exp $ // //-------------------------------------------------------------------- //------------------------------------------------------------------ // // f_dwf_base_force.C // // (R)HMC force term for FdwfBase // //------------------------------------------------------------------ CPS_END_NAMESPACE #include <util/qcdio.h> #include <math.h> #include <util/lattice.h> #include <util/dirac_op.h> #include <util/dwf.h> #include <util/gjp.h> #include <util/verbose.h> #include <util/vector.h> #include <util/random.h> #include <util/error.h> #include <util/time_cps.h> #include <comms/scu.h> // GRF #include <comms/glb.h> CPS_START_NAMESPACE #undef PROFILE // CJ: change start //------------------------------------------------------------------ // EvolveMomFforce(Matrix *mom, Vector *chi, Float mass, // Float dt): // It evolves the canonical momentum mom by dt // using the fermion force. 
//------------------------------------------------------------------ ForceArg FdwfBase::EvolveMomFforce(Matrix *mom, Vector *chi, Float mass, Float dt){ char *fname = "EvolveMomFforce(M*,V*,F,F,F)"; VRB.Func(cname,fname); Matrix *gauge = GaugeField() ; if (Colors() != 3) ERR.General(cname,fname,"Wrong nbr of colors.") ; if (SpinComponents() != 4) ERR.General(cname,fname,"Wrong nbr of spin comp.") ; if (mom == 0) ERR.Pointer(cname,fname,"mom") ; if (chi == 0) ERR.Pointer(cname,fname,"chi") ; //---------------------------------------------------------------- // allocate space for two CANONICAL fermion fields //---------------------------------------------------------------- int f_size = FsiteSize() * GJP.VolNodeSites() ; int f_site_size_4d = 2 * Colors() * SpinComponents(); int f_size_4d = f_site_size_4d * GJP.VolNodeSites() ; char *str_v1 = "v1" ; Vector *v1 = (Vector *)smalloc(f_size*sizeof(Float)) ; if (v1 == 0) ERR.Pointer(cname, fname, str_v1) ; VRB.Smalloc(cname, fname, str_v1, v1, f_size*sizeof(Float)) ; char *str_v2 = "v2" ; Vector *v2 = (Vector *)smalloc(f_size*sizeof(Float)) ; if (v2 == 0) ERR.Pointer(cname, fname, str_v2) ; VRB.Smalloc(cname, fname, str_v2, v2, f_size*sizeof(Float)) ; //---------------------------------------------------------------- // allocate buffer space for two fermion fields that are assoc // with only one 4-D site. 
//---------------------------------------------------------------- char *str_site_v1 = "site_v1" ; Float *site_v1 = (Float *)smalloc(FsiteSize()*sizeof(Float)) ; if (site_v1 == 0) ERR.Pointer(cname, fname, str_site_v1) ; VRB.Smalloc(cname, fname, str_site_v1, site_v1, FsiteSize()*sizeof(Float)) ; char *str_site_v2 = "site_v2" ; Float *site_v2 = (Float *)smalloc(FsiteSize()*sizeof(Float)) ; if (site_v2 == 0) ERR.Pointer(cname, fname, str_site_v2) ; VRB.Smalloc(cname, fname, str_site_v2, site_v2, FsiteSize()*sizeof(Float)) ; Float L1 = 0.0; Float L2 = 0.0; Float Linf = 0.0; //---------------------------------------------------------------- // Calculate v1, v2. Both v1, v2 must be in CANONICAL order after // the calculation. //---------------------------------------------------------------- VRB.Clock(cname, fname, "Before calc force vecs.\n") ; VRB.Flow(cname, fname, "Before calc force vecs.\n") ; { CgArg cg_arg ; cg_arg.mass = mass ; DiracOpDwf dwf(*this, v1, v2, &cg_arg, CNV_FRM_YES) ; dwf.CalcHmdForceVecs(chi) ; } VRB.Flow(cname, fname, "After calc force vecs.\n") ; #ifdef PROFILE Float time = -dclock(); ForceFlops=0; #endif int mu, x, y, z, t, s, lx, ly, lz, lt, ls ; lx = GJP.XnodeSites() ; ly = GJP.YnodeSites() ; lz = GJP.ZnodeSites() ; lt = GJP.TnodeSites() ; ls = GJP.SnodeSites() ; Matrix tmp_mat1, tmp_mat2 ; //------------------------------------------------------------------ // start by summing first over direction (mu) and then over site // to allow SCU transfers to happen face-by-face in the outermost // loop. 
//------------------------------------------------------------------ VRB.Clock(cname, fname, "Before loop over links.\n") ; for (mu=0; mu<4; mu++){ for (t=0; t<lt; t++){ for (z=0; z<lz; z++){ for (y=0; y<ly; y++){ for (x=0; x<lx; x++){ int gauge_offset = x+lx*(y+ly*(z+lz*t)) ; int vec_offset = f_site_size_4d*gauge_offset ; gauge_offset = mu+4*gauge_offset ; Float *v1_plus_mu ; Float *v2_plus_mu ; int vec_plus_mu_stride ; int vec_plus_mu_offset = f_site_size_4d ; Float coeff = -2.0 * dt ; switch (mu) { case 0 : vec_plus_mu_offset *= (x+1)%lx+lx*(y+ly*(z+lz*t)) ; if ((x+1) == lx) { for (s=0; s<ls; s++) { getPlusData( (IFloat *)site_v1+s*f_site_size_4d, (IFloat *)v1+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; getPlusData( (IFloat *)site_v2+s*f_site_size_4d, (IFloat *)v2+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; } // end for s v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; vec_plus_mu_stride = 0 ; if (GJP.XnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; } break ; case 1 : vec_plus_mu_offset *= x+lx*((y+1)%ly+ly*(z+lz*t)) ; if ((y+1) == ly) { for (s=0; s<ls; s++) { getPlusData( (IFloat *)site_v1+s*f_site_size_4d, (IFloat *)v1+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; getPlusData( (IFloat *)site_v2+s*f_site_size_4d, (IFloat *)v2+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; } // end for s v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; vec_plus_mu_stride = 0 ; if (GJP.YnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; } break ; case 2 : vec_plus_mu_offset *= x+lx*(y+ly*((z+1)%lz+lz*t)) ; if ((z+1) == lz) { for (s=0; s<ls; s++) { getPlusData( (IFloat *)site_v1+s*f_site_size_4d, (IFloat *)v1+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; 
getPlusData( (IFloat *)site_v2+s*f_site_size_4d, (IFloat *)v2+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; } // end for s v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; vec_plus_mu_stride = 0 ; if (GJP.ZnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; } break ; case 3 : vec_plus_mu_offset *= x+lx*(y+ly*(z+lz*((t+1)%lt))) ; if ((t+1) == lt) { for (s=0; s<ls; s++) { getPlusData( (IFloat *)site_v1+s*f_site_size_4d, (IFloat *)v1+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; getPlusData( (IFloat *)site_v2+s*f_site_size_4d, (IFloat *)v2+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; } // end for s v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; vec_plus_mu_stride = 0 ; if (GJP.TnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; } } // end switch mu sproj_tr[mu]( (IFloat *)&tmp_mat1, (IFloat *)v1_plus_mu, (IFloat *)v2+vec_offset, ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ; sproj_tr[mu+4]( (IFloat *)&tmp_mat2, (IFloat *)v2_plus_mu, (IFloat *)v1+vec_offset, ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ; tmp_mat1 += tmp_mat2 ; // If GJP.Snodes > 1 sum up contributions from all s nodes if(GJP.Snodes() > 1) { // if(!UniqueID())printf("%s::%s:GJP.Snodes()=%d\n",cname,fname,GJP.Snodes()); glb_sum_multi_dir((Float *)&tmp_mat1,4,sizeof(Matrix)/sizeof(IFloat)); } tmp_mat2.DotMEqual(*(gauge+gauge_offset), tmp_mat1) ; tmp_mat1.Dagger(tmp_mat2) ; tmp_mat2.TrLessAntiHermMatrix(tmp_mat1) ; tmp_mat2 *= coeff ; *(mom+gauge_offset) += tmp_mat2 ; Float norm = tmp_mat2.norm(); Float tmp = sqrt(norm); L1 += tmp; L2 += norm; Linf = (tmp>Linf ? 
tmp : Linf); } } } } // end for x,y,z,t } // end for mu ForceFlops += (2*9*16*ls + 18+ 198+36+24)*lx*ly*lz*lt*4; #ifdef PROFILE time += dclock(); print_flops(cname,fname,ForceFlops,time); #endif //------------------------------------------------------------------ // deallocate smalloc'd space //------------------------------------------------------------------ VRB.Sfree(cname, fname, str_site_v2, site_v2) ; sfree(site_v2) ; VRB.Sfree(cname, fname, str_site_v1, site_v1) ; sfree(site_v1) ; VRB.Sfree(cname, fname, str_v2, v2) ; sfree(v2) ; VRB.Sfree(cname, fname, str_v1, v1) ; sfree(v1) ; glb_sum(&L1); glb_sum(&L2); glb_max(&Linf); L1 /= 4.0*GJP.VolSites(); L2 /= 4.0*GJP.VolSites(); VRB.FuncEnd(cname,fname); return ForceArg(L1, sqrt(L2), Linf); }
//------------------------------------------------------------------ // EvolveMomFforce(Matrix *mom, Vector *frm, Float mass, // Float dt): // It evolves the canonical momentum mom by dt // using the fermion force. //------------------------------------------------------------------ ForceArg Fwilson::EvolveMomFforce(Matrix *mom, Vector *chi, Float mass, Float dt) { char *fname = "EvolveMomFforce(M*,V*,F,F,F)"; VRB.Func(cname,fname); Matrix *gauge = GaugeField() ; if (Colors() != 3) ERR.General(cname,fname,"Wrong nbr of colors.") ; if (SpinComponents() != 4) ERR.General(cname,fname,"Wrong nbr of spin comp.") ; if (mom == 0) ERR.Pointer(cname,fname,"mom") ; if (chi == 0) ERR.Pointer(cname,fname,"chi") ; //------------------------------------------------------------------ // allocate space for two CANONICAL fermion fields. //------------------------------------------------------------------ int f_size = FsiteSize() * GJP.VolNodeSites() ; char *str_v1 = "v1" ; Vector *v1 = (Vector *)smalloc(f_size*sizeof(Float)) ; if (v1 == 0) ERR.Pointer(cname, fname, str_v1) ; VRB.Smalloc(cname, fname, str_v1, v1, f_size*sizeof(Float)) ; char *str_v2 = "v2" ; Vector *v2 = (Vector *)smalloc(f_size*sizeof(Float)) ; if (v2 == 0) ERR.Pointer(cname, fname, str_v2) ; VRB.Smalloc(cname, fname, str_v2, v2, f_size*sizeof(Float)) ; //------------------------------------------------------------------ // allocate space for two CANONICAL fermion field on a site. 
//------------------------------------------------------------------ char *str_site_v1 = "site_v1"; Float *site_v1 = (Float *)smalloc(FsiteSize()*sizeof(Float)); if (site_v1 == 0) ERR.Pointer(cname, fname, str_site_v1) ; VRB.Smalloc(cname, fname, str_site_v1, site_v1, FsiteSize()*sizeof(Float)) ; char *str_site_v2 = "site_v2"; Float *site_v2 = (Float *)smalloc(FsiteSize()*sizeof(Float)); if (site_v2 == 0) ERR.Pointer(cname, fname, str_site_v2) ; VRB.Smalloc(cname, fname, str_site_v2, site_v2, FsiteSize()*sizeof(Float)) ; { CgArg cg_arg ; cg_arg.mass = mass ; DiracOpWilson wilson(*this, v1, v2, &cg_arg, CNV_FRM_YES) ; wilson.CalcHmdForceVecs(chi) ; } int x, y, z, t, lx, ly, lz, lt ; lx = GJP.XnodeSites() ; ly = GJP.YnodeSites() ; lz = GJP.ZnodeSites() ; lt = GJP.TnodeSites() ; //------------------------------------------------------------------ // start by summing first over direction (mu) and then over site // to allow SCU transfers to happen face-by-face in the outermost // loop. //------------------------------------------------------------------ int mu ; Matrix tmp, f ; Float L1 = 0.0; Float L2 = 0.0; Float Linf = 0.0; for (mu=0; mu<4; mu++) { for (t=0; t<lt; t++) for (z=0; z<lz; z++) for (y=0; y<ly; y++) for (x=0; x<lx; x++) { int gauge_offset = x+lx*(y+ly*(z+lz*t)) ; int vec_offset = FsiteSize()*gauge_offset ; gauge_offset = mu+4*gauge_offset ; Float *v1_plus_mu ; Float *v2_plus_mu ; int vec_plus_mu_offset = FsiteSize() ; Float coeff = -2.0 * dt ; switch (mu) { case 0 : vec_plus_mu_offset *= (x+1)%lx+lx*(y+ly*(z+lz*t)) ; if ((x+1) == lx) { getPlusData( (IFloat *)site_v1, (IFloat *)v1+vec_plus_mu_offset, FsiteSize(), mu) ; getPlusData( (IFloat *)site_v2, (IFloat *)v2+vec_plus_mu_offset, FsiteSize(), mu) ; v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; if (GJP.XnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; } break ; case 1 : vec_plus_mu_offset *= 
x+lx*((y+1)%ly+ly*(z+lz*t)) ; if ((y+1) == ly) { getPlusData( (IFloat *)site_v1, (IFloat *)v1+vec_plus_mu_offset, FsiteSize(), mu) ; getPlusData( (IFloat *)site_v2, (IFloat *)v2+vec_plus_mu_offset, FsiteSize(), mu) ; v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; if (GJP.YnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; } break ; case 2 : vec_plus_mu_offset *= x+lx*(y+ly*((z+1)%lz+lz*t)) ; if ((z+1) == lz) { getPlusData( (IFloat *)site_v1, (IFloat *)v1+vec_plus_mu_offset, FsiteSize(), mu) ; getPlusData( (IFloat *)site_v2, (IFloat *)v2+vec_plus_mu_offset, FsiteSize(), mu) ; v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; if (GJP.ZnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; } break ; case 3 : vec_plus_mu_offset *= x+lx*(y+ly*(z+lz*((t+1)%lt))) ; if ((t+1) == lt) { getPlusData( (IFloat *)site_v1, (IFloat *)v1+vec_plus_mu_offset, FsiteSize(), mu) ; getPlusData( (IFloat *)site_v2, (IFloat *)v2+vec_plus_mu_offset, FsiteSize(), mu) ; v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; if (GJP.TnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; } } // end switch mu sproj_tr[mu]( (IFloat *)&tmp, (IFloat *)v1_plus_mu, (IFloat *)v2+vec_offset, 1, 0, 0); sproj_tr[mu+4]( (IFloat *)&f, (IFloat *)v2_plus_mu, (IFloat *)v1+vec_offset, 1, 0, 0); tmp += f ; f.DotMEqual(*(gauge+gauge_offset), tmp) ; tmp.Dagger(f) ; f.TrLessAntiHermMatrix(tmp) ; f *= coeff ; *(mom+gauge_offset) += f ; Float norm = f.norm(); Float tmp = sqrt(norm); L1 += tmp; L2 += norm; Linf = (tmp>Linf ? tmp : Linf); } } //------------------------------------------------------------------ // deallocate space for two CANONICAL fermion fields on a site. 
//------------------------------------------------------------------ VRB.Sfree(cname, fname, str_site_v2, site_v2) ; sfree(site_v2) ; VRB.Sfree(cname, fname, str_site_v1, site_v1) ; sfree(site_v1) ; //------------------------------------------------------------------ // deallocate space for two CANONICAL fermion fields. //------------------------------------------------------------------ VRB.Sfree(cname, fname, str_v2, v2) ; sfree(v2) ; VRB.Sfree(cname, fname, str_v1, v1) ; sfree(v1) ; glb_sum(&L1); glb_sum(&L2); glb_max(&Linf); L1 /= 4.0*GJP.VolSites(); L2 /= 4.0*GJP.VolSites(); return ForceArg(L1, sqrt(L2), Linf); }
// CJ: change start //------------------------------------------------------------------ // EvolveMomFforce(Matrix *mom, Vector *chi, Float mass, // Float dt): // It evolves the canonical momentum mom by dt // using the fermion force. //------------------------------------------------------------------ ForceArg FdwfBase::EvolveMomFforce(Matrix *mom, Vector *chi, Float mass, Float dt){ char *fname = "EvolveMomFforce(M*,V*,F,F,F)"; VRB.Func(cname,fname); Matrix *gauge = GaugeField() ; if (Colors() != 3) ERR.General(cname,fname,"Wrong nbr of colors.") ; if (SpinComponents() != 4) ERR.General(cname,fname,"Wrong nbr of spin comp.") ; if (mom == 0) ERR.Pointer(cname,fname,"mom") ; if (chi == 0) ERR.Pointer(cname,fname,"chi") ; //---------------------------------------------------------------- // allocate space for two CANONICAL fermion fields //---------------------------------------------------------------- int f_size = FsiteSize() * GJP.VolNodeSites() ; int f_site_size_4d = 2 * Colors() * SpinComponents(); int f_size_4d = f_site_size_4d * GJP.VolNodeSites() ; char *str_v1 = "v1" ; Vector *v1 = (Vector *)fmalloc(cname,fname,str_v1,f_size*sizeof(Float)); // if (v1 == 0) ERR.Pointer(cname, fname, str_v1) ; // VRB.Smalloc(cname, fname, str_v1, v1, f_size*sizeof(Float)) ; char *str_v2 = "v2" ; Vector *v2 = (Vector *)fmalloc(cname,fname,str_v2,f_size*sizeof(Float)) ; // if (v2 == 0) ERR.Pointer(cname, fname, str_v2) ; // VRB.Smalloc(cname, fname, str_v2, v2, f_size*sizeof(Float)) ; // LatMatrix MomDiff(QFAST,4); // Matrix *mom_diff = MomDiff.Mat(); Float L1 = 0.0; Float L2 = 0.0; Float Linf = 0.0; //---------------------------------------------------------------- // Calculate v1, v2. Both v1, v2 must be in CANONICAL order after // the calculation. 
//---------------------------------------------------------------- VRB.Clock(cname, fname, "Before calc force vecs.\n") ; #ifdef PROFILE Float time = -dclock(); ForceFlops=0; #endif { CgArg cg_arg ; cg_arg.mass = mass ; DiracOpDwf dwf(*this, v1, v2, &cg_arg, CNV_FRM_NO) ; dwf.CalcHmdForceVecs(chi) ; Fconvert(v1,CANONICAL,WILSON); Fconvert(v2,CANONICAL,WILSON); } #ifdef PROFILE time += dclock(); print_flops(cname,fname,ForceFlops,time); #endif int mu, x, y, z, t, s, lx, ly, lz, lt, ls ; int size[5],surf[4],blklen[4],stride[4],numblk[4]; size[0] = lx = GJP.XnodeSites() ; size[1] = ly = GJP.YnodeSites() ; size[2] = lz = GJP.ZnodeSites() ; size[3] = lt = GJP.TnodeSites() ; size[4] = ls = GJP.SnodeSites() ; blklen[0] = sizeof(Float)*FsiteSize()/size[4]; numblk[0] = GJP.VolNodeSites()/size[0]*size[4]; stride[0] = blklen[0] * (size[0]-1); for (int i =1;i<4;i++){ blklen[i] = blklen[i-1] * size[i-1]; numblk[i] = numblk[i-1] / size[i]; stride[i] = blklen[i] * (size[i]-1); } for (int i =0;i<4;i++){ surf[i] = GJP.VolNodeSites()/size[i]; } //---------------------------------------------------------------- // allocate buffer space for two fermion fields that are assoc // with only one 4-D site. 
//---------------------------------------------------------------- unsigned long flag = QFAST; if (ls<4) flag|= QNONCACHE; char *str_site_v1 = "site_v1" ; char *str_site_v2 = "site_v2" ; Float *v1_buf[4],*v2_buf[4]; int pos[4]; Float *v1_buf_pos[4]; Float *v2_buf_pos[4]; for(int i =0;i<4;i++) { v1_buf[i]=(Float *)fmalloc(cname,fname,"v1_buf",surf[i]*FsiteSize()*sizeof(Float)) ; v2_buf[i]=(Float *)fmalloc(cname,fname,"v2_buf",surf[i]*FsiteSize()*sizeof(Float)) ; } // Matrix tmp_mat1, tmp_mat2 ; Matrix *tmp_mat1,*tmp_mat2; tmp_mat1 = (Matrix *)fmalloc(cname,fname,"tmp_mat1",sizeof(Matrix)*2); tmp_mat2 = tmp_mat1+1; SCUDirArgIR Send[4]; SCUDirArgIR Recv[4]; SCUDMAInst *dma[2]; for(int i = 0;i<2;i++) dma[i] = new SCUDMAInst; void *addr[2]; int f_bytes = sizeof(Float)*f_site_size_4d; int st_bytes = sizeof(Float)*f_size_4d - f_bytes; for (mu=0; mu<4; mu++){ if ( GJP.Nodes(mu) >1){ dma[0]->Init(v1_buf[mu],surf[mu]*ls*f_bytes); dma[1]->Init(v2_buf[mu],surf[mu]*ls*f_bytes); Recv[mu].Init(gjp_scu_dir[2*mu],SCU_REC,dma,2); dma[0]->Init(v1,blklen[mu],numblk[mu],stride[mu]); dma[1]->Init(v2,blklen[mu],numblk[mu],stride[mu]); Send[mu].Init(gjp_scu_dir[2*mu+1],SCU_SEND,dma,2); } } for(int i = 0;i<2;i++) delete dma[i]; VRB.Clock(cname, fname, "Before loop over links.\n") ; #ifdef PROFILE time = -dclock(); ForceFlops=0; #endif sys_cacheflush(0); for (mu=0; mu<4; mu++){ if ( GJP.Nodes(mu) >1){ Recv[mu].StartTrans(); Send[mu].StartTrans(); } } for (mu=0; mu<4; mu++){ for (pos[3]=0; pos[3]<size[3]; pos[3]++){ for (pos[2]=0; pos[2]<size[2]; pos[2]++){ for (pos[1]=0; pos[1]<size[1]; pos[1]++){ for (pos[0]=0; pos[0]<size[0]; pos[0]++){ int gauge_offset = offset(size,pos); int vec_offset = f_site_size_4d*gauge_offset ; gauge_offset = mu+4*gauge_offset ; // printf("%d %d %d %d %d\n",mu,pos[0],pos[1],pos[2],pos[3]); Float *v1_plus_mu ; Float *v2_plus_mu ; int vec_plus_mu_stride ; int vec_plus_mu_offset = f_site_size_4d ; Float coeff = -2.0 * dt ; vec_plus_mu_offset *= offset(size,pos,mu); 
if ((GJP.Nodes(mu)>1)&&((pos[mu]+1) == size[mu]) ) { } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; if ((pos[mu]+1) == size[mu]) if (GJP.NodeBc(mu)==BND_CND_APRD) coeff = -coeff ; // Float time2 = -dclock(); sproj_tr[mu]( (IFloat *)tmp_mat1, (IFloat *)v1_plus_mu, (IFloat *)v2+vec_offset, ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ; sproj_tr[mu+4]( (IFloat *)tmp_mat2, (IFloat *)v2_plus_mu, (IFloat *)v1+vec_offset, ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ; // time2 += dclock(); // print_flops("","sproj",2*9*16*ls,time2); *tmp_mat1 += *tmp_mat2 ; // If GJP.Snodes > 1 sum up contributions from all s nodes if(GJP.Snodes() > 1) { glb_sum_multi_dir((Float *)tmp_mat1, 4, sizeof(Matrix)/sizeof(IFloat) ) ; } tmp_mat2->DotMEqual(*(gauge+gauge_offset), *tmp_mat1) ; tmp_mat1->Dagger(*tmp_mat2) ; tmp_mat2->TrLessAntiHermMatrix(*tmp_mat1) ; *tmp_mat2 *= coeff ; *(mom+gauge_offset) += *tmp_mat2 ; Float norm = tmp_mat2->norm(); Float tmp = sqrt(norm); L1 += tmp; L2 += norm; Linf = (tmp>Linf ? tmp : Linf); } } } } } // end for x,y,z,t } // end for mu for (mu=0; mu<4; mu++){ if ( GJP.Nodes(mu) >1){ Recv[mu].TransComplete(); Send[mu].TransComplete(); } } //------------------------------------------------------------------ // start by summing first over direction (mu) and then over site // to allow SCU transfers to happen face-by-face in the outermost // loop. 
//------------------------------------------------------------------ for(int i = 0;i<4;i++){ v1_buf_pos[i] = v1_buf[i]; v2_buf_pos[i] = v2_buf[i]; } for (mu=0; mu<4; mu++){ for (pos[3]=0; pos[3]<size[3]; pos[3]++){ for (pos[2]=0; pos[2]<size[2]; pos[2]++){ for (pos[1]=0; pos[1]<size[1]; pos[1]++){ for (pos[0]=0; pos[0]<size[0]; pos[0]++){ int gauge_offset = offset(size,pos); int vec_offset = f_site_size_4d*gauge_offset ; gauge_offset = mu+4*gauge_offset ; Float *v1_plus_mu ; Float *v2_plus_mu ; int vec_plus_mu_stride ; int vec_plus_mu_offset = f_site_size_4d ; Float coeff = -2.0 * dt ; // printf("%d %d %d %d %d\n",mu,pos[0],pos[1],pos[2],pos[3]); vec_plus_mu_offset *= offset(size,pos,mu); if ((GJP.Nodes(mu)>1)&&((pos[mu]+1) == size[mu]) ) { v1_plus_mu = v1_buf_pos[mu] ; v2_plus_mu = v2_buf_pos[mu] ; v1_buf_pos[mu] += f_site_size_4d; v2_buf_pos[mu] += f_site_size_4d; vec_plus_mu_stride = (surf[mu] -1)*f_site_size_4d ; if (GJP.NodeBc(mu)==BND_CND_APRD) coeff = -coeff ; // Float time2 = -dclock(); sproj_tr[mu]( (IFloat *)tmp_mat1, (IFloat *)v1_plus_mu, (IFloat *)v2+vec_offset, ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ; sproj_tr[mu+4]( (IFloat *)tmp_mat2, (IFloat *)v2_plus_mu, (IFloat *)v1+vec_offset, ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ; // time2 += dclock(); // print_flops("","sproj",2*9*16*ls,time2); *tmp_mat1 += *tmp_mat2 ; // If GJP.Snodes > 1 sum up contributions from all s nodes if(GJP.Snodes() >1 ) { glb_sum_multi_dir((Float *)tmp_mat1, 4, sizeof(Matrix)/sizeof(IFloat) ) ; } tmp_mat2->DotMEqual(*(gauge+gauge_offset), *tmp_mat1) ; tmp_mat1->Dagger(*tmp_mat2) ; tmp_mat2->TrLessAntiHermMatrix(*tmp_mat1) ; *tmp_mat2 *= coeff ; *(mom+gauge_offset) += *tmp_mat2 ; Float norm = tmp_mat2->norm(); Float tmp = sqrt(norm); L1 += tmp; L2 += norm; Linf = (tmp>Linf ? 
tmp : Linf); } } } } } // end for x,y,z,t } // end for mu ForceFlops += (2*9*16*ls + 18+ 198+36+24)*lx*ly*lz*lt*4; #ifdef PROFILE time += dclock(); print_flops(cname,fname,ForceFlops,time); #endif //------------------------------------------------------------------ // deallocate smalloc'd space //------------------------------------------------------------------ for(int i =0;i<4;i++) { ffree(v1_buf[i],cname,fname,"v1_buf"); ffree(v2_buf[i],cname,fname,"v2_buf"); } VRB.Sfree(cname, fname, str_v2, v2) ; ffree(v2) ; VRB.Sfree(cname, fname, str_v1, v1) ; ffree(v1) ; ffree(tmp_mat1); glb_sum(&L1); glb_sum(&L2); glb_max(&Linf); L1 /= 4.0*GJP.VolSites(); L2 /= 4.0*GJP.VolSites(); return ForceArg(L1, sqrt(L2), Linf); }