CPS_START_NAMESPACE
/*!\file
  \brief Definitions of functions that perform operations on complex matrices and vectors.

  $Id: vector_util.C,v 1.10 2013-04-19 20:25:52 chulwoo Exp $
*/
//--------------------------------------------------------------------
//  CVS keywords
//
//  $Author: chulwoo $
//  $Date: 2013-04-19 20:25:52 $
//  $Header: /home/chulwoo/CPS/repo/CVS/cps_only/cps_pp/src/util/vector/comsrc/vector_util.C,v 1.10 2013-04-19 20:25:52 chulwoo Exp $
//  $Id: vector_util.C,v 1.10 2013-04-19 20:25:52 chulwoo Exp $
//  $Name: not supported by cvs2svn $
//  $Locker: $
//  $Revision: 1.10 $
//  $Source: /home/chulwoo/CPS/repo/CVS/cps_only/cps_pp/src/util/vector/comsrc/vector_util.C,v $
//  $State: Exp $
//
//--------------------------------------------------------------------

/*------------------------------------------------------------------*/
/* For these functions there exists optimized assembly code.        */
/*------------------------------------------------------------------*/

CPS_END_NAMESPACE
#include <string.h>        /* memcpy */
#include <util/vector.h>
#include <util/time_cps.h>
//#include <omp.h>
CPS_START_NAMESPACE

/*!
  \param b The vector to be copied to.
  \param a The vector to be copied from.
  \param len The number of bytes to be copied.

  The arrays \a a and \a b must not alias each other.
*/
//---------------------------------------------------------------//
void moveMem(void *b, const void *a, int len)
{
#undef PROFILE
#ifdef PROFILE
  double time = -dclock();
#endif
  memcpy(b, a, len);
#ifdef PROFILE
  time += dclock();
  print_flops("", "moveMem", len, time);
#endif
}
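// ---------------------------------------------------------------------------
// Hedged usage sketch (not part of the original source): moveMem() is a thin
// wrapper around memcpy(), so the caller supplies the byte count and must
// guarantee that the two regions do not overlap.  The "18" below is simply
// the number of Floats in one 3x3 complex matrix and is an illustrative
// assumption; the helper name is hypothetical.
// ---------------------------------------------------------------------------
static void moveMem_usage_sketch()
{
  Float src[18], dst[18];                  // one colour matrix worth of data
  for (int i = 0; i < 18; i++) src[i] = 0.1 * i;

  moveMem(dst, src, 18 * sizeof(Float));   // OK: dst and src do not overlap

  // moveMem(src + 1, src, 17 * sizeof(Float));  // NOT allowed: regions alias
}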
//Parallel transport of a vector through one hop void PT::vec(int n, IFloat **vout, IFloat **vin, const int *dir){ int i; static int call_num=0; SCUDirArgIR *SCUarg_p[2*n]; call_num++; //for(int s = 0; s < GJP.VolNodeSites(); s++) // { // for(int t = 0; t < 4; t++) // { // printf("site = %d, direction = %d\n",s,t); // for(int u = 0; u < 9; u++) // printf("%e %e\n",*(gauge_field_addr+4*GAUGE_LEN*s + GAUGE_LEN*t + 2*u),*(gauge_field_addr+4*GAUGE_LEN*s + GAUGE_LEN*t + 2*u+1)); // } // } #ifdef PROFILE Float dtime = - dclock(); #endif int wire[n]; SCUDirArgMulti SCUmulti; char *fname="pt_1vec"; // VRB.Func("",fname); int non_local_dir=0; for(i=0;i<n;i++) wire[i] = dir[i]; // from (x,y,z,t) to (t,x,y,z) // for(i=0;i<n;i++) printf("wire[%d]=%d\n",i,dir[i]); for(i=0;i<n;i++) if (!local[wire[i]/2]){ IFloat * addr = (vin[i]+VECT_LEN*offset[wire[i]]); SCUarg_p[2*non_local_dir] = SCUarg[0][2*wire[i]]; SCUarg_p[2*non_local_dir+1] = SCUarg[0][2*wire[i]+1]; SCUarg_p[2*non_local_dir+1]->Addr((void *)addr); non_local_dir++; } if(non_local_dir){ SCUmulti.Init(SCUarg_p,non_local_dir*2); SCUmulti.SlowStartTrans(); } for(i=0;i<n;i++) partrans_cmv_agg(local_chi[wire[i]],(long)uc_l[wire[i]], (long)vin[i],(long)vout[i]); if(non_local_dir){ SCUmulti.TransComplete(); } for(i=0;i<n;i++) partrans_cmv_agg(non_local_chi[wire[i]],(long)uc_nl[wire[i]], (long)rcv_buf[wire[i]],(long)vout[i]); #ifdef PROFILE dtime +=dclock(); print_flops("",fname,66*n*vol,dtime); #endif Flops +=66*n*vol; }
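// ---------------------------------------------------------------------------
// Hedged sketch (not taken from the original source) of the communication /
// compute overlap pattern that PT::vec() above follows: transfers for the
// non-local hops are started first, the purely local matrix*vector work runs
// while they are in flight, and only then is the received surface data
// processed.  start_transfers/wait_transfers/local_work/boundary_work are
// hypothetical stand-ins for SCUDirArgMulti::SlowStartTrans(),
// SCUDirArgMulti::TransComplete() and the two partrans_cmv_agg() passes.
// ---------------------------------------------------------------------------
typedef void (*HopKernel)(int hop);

static void overlapped_hops(int n_hops, int any_non_local,
                            void (*start_transfers)(void),
                            void (*wait_transfers)(void),
                            HopKernel local_work,
                            HopKernel boundary_work)
{
  if (any_non_local) start_transfers();                 // SlowStartTrans()
  for (int i = 0; i < n_hops; i++) local_work(i);       // overlap computation
  if (any_non_local) wait_transfers();                  // TransComplete()
  for (int i = 0; i < n_hops; i++) boundary_work(i);    // consume rcv_buf data
}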
void moveVec(Float *b, const Float *a, int len)
{
#undef PROFILE
#ifdef PROFILE
  double time = -dclock();
#endif
//  for(int i = 0; i < len*6; i++) *b++ = *a++;
  memcpy(b, a, len*sizeof(Vector));
#ifdef PROFILE
  time += dclock();
  print_flops("", "moveVec", len*sizeof(Float), time);
#endif
}
ForceArg GimprRect::EvolveMomGforce(Matrix *mom, Float dt)
{
  char *fname = "EvolveMomGforce(M*,F)";
  VRB.Func(cname,fname);

  Float L1 = 0.0;
  Float L2 = 0.0;
  Float Linf = 0.0;

#ifdef PROFILE
  Float time = -dclock();
  ForceFlops = 0;
#endif

  setCbufCntrlReg(4, CBUF_MODE4);

  int x[4];
  for(x[0] = 0; x[0] < GJP.XnodeSites(); ++x[0])
    for(x[1] = 0; x[1] < GJP.YnodeSites(); ++x[1])
      for(x[2] = 0; x[2] < GJP.ZnodeSites(); ++x[2])
        for(x[3] = 0; x[3] < GJP.TnodeSites(); ++x[3]) {

          int uoff = GsiteOffset(x);

          for (int mu = 0; mu < 4; ++mu) {
            GforceSite(*mp0, x, mu);

            IFloat *ihp  = (IFloat *)(mom+uoff+mu);
            IFloat *dotp = (IFloat *)mp0;
            fTimesV1PlusV2(ihp, dt, dotp, ihp, 18);

            Float norm = ((Matrix*)dotp)->norm();
            Float tmp = sqrt(norm);
            L1 += tmp;
            L2 += norm;
            Linf = (tmp>Linf ? tmp : Linf);
          }
        }

  ForceFlops += GJP.VolNodeSites()*4*18*2;

#ifdef PROFILE
  time += dclock();
  print_flops(cname,fname,ForceFlops,time);
#endif

  glb_sum(&L1);
  glb_sum(&L2);
  glb_max(&Linf);

  L1 /= 4.0*GJP.VolSites();
  L2 /= 4.0*GJP.VolSites();

  VRB.FuncEnd(cname,fname);
  return ForceArg(dt*L1, dt*sqrt(L2), dt*Linf);
}
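// ---------------------------------------------------------------------------
// Hedged sketch (not part of the original source) of the L1/L2/Linf
// bookkeeping above: the returned ForceArg packs the mean, RMS and maximum of
// the per-link force norm, each scaled by dt in the caller.  The helper below
// is a single-node, plain-double version with the global sums omitted;
// n_links corresponds to 4*GJP.VolSites().
// ---------------------------------------------------------------------------
static void force_norms_sketch(const double *norm_sq, int n_links,
                               double *mean, double *rms, double *linf)
{
  double l1 = 0.0, l2 = 0.0, li = 0.0;
  for (int i = 0; i < n_links; i++) {
    double t = sqrt(norm_sq[i]);      // |F| on one link
    l1 += t;
    l2 += norm_sq[i];
    li  = (t > li ? t : li);
  }
  *mean = l1 / n_links;               // corresponds to L1 /= 4*VolSites()
  *rms  = sqrt(l2 / n_links);         // corresponds to sqrt(L2 / 4*VolSites())
  *linf = li;                         // corresponds to Linf (after glb_max)
}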
//!< Calculate the conjugate momentum contribution to the Hamiltonian
Float AlgMomentum::energy()
{
  Float dtime = -dclock();
  const char *fname = "energy()";

  Lattice &lat = LatticeFactory::Create(F_CLASS_NONE, G_CLASS_NONE);
  Float h = lat.MomHamiltonNode(mom);
  LatticeFactory::Destroy();

  dtime += dclock();
  print_flops(cname, fname, 0, dtime);
  return h;
}
//!< Evolve the gauge field due to the momentum
void AlgMomentum::evolve(Float dt, int steps)
{
  const char *fname = "evolve()";
  Float dtime = -dclock();

  Lattice &lat = LatticeFactory::Create(F_CLASS_NONE, G_CLASS_NONE);
  for (int i=0; i<steps; i++) lat.EvolveGfield(mom, dt);
  lat.MdTimeInc(dt*steps);
  VRB.Flow(cname,fname,"%s%f\n", md_time_str, IFloat(lat.MdTime()));
  LatticeFactory::Destroy();

  dtime += dclock();
  print_flops(cname, fname, 1968. * 4. * GJP.VolNodeSites() * steps, dtime);
}
void moveFloat(Float *b, const Float *a, int len)
{
#undef PROFILE
#ifdef PROFILE
  double time = -dclock();
#endif
#ifdef USE_OMP
#pragma omp parallel for
  for(int i = 0; i < len; i++) b[i] = a[i];
#else
  memcpy(b, a, len*sizeof(Float));
#endif
#ifdef PROFILE
  time += dclock();
  print_flops("", "moveFloat", len*sizeof(Float), time);
#endif
}
//!< Heat bath for the conjugate momentum
void AlgMomentum::heatbath()
{
  const char *fname = "heatbath()";
  Float dtime = -dclock();

  Lattice &lat = LatticeFactory::Create(F_CLASS_NONE, G_CLASS_NONE);
  lat.RandGaussAntiHermMatrix(mom, 1.0);

  //!< Reset MD time in Lattice (a momentum refresh means a new trajectory)
  lat.MdTime(0.0);
  VRB.Flow(cname,fname,"%s%f\n", md_time_str, IFloat(lat.MdTime()));
  LatticeFactory::Destroy();

  dtime += dclock();
  print_flops(cname, fname, 0, dtime);
}
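// ---------------------------------------------------------------------------
// Hedged sketch (an assumption, not taken from the original source) of how
// the three AlgMomentum methods above are typically combined within one
// HMC-style trajectory: refresh the momenta, record the initial momentum
// action, alternate gauge-field updates with (external) momentum updates,
// then measure again for the accept/reject step.  n_md_steps, step_size and
// the force update are placeholders.
// ---------------------------------------------------------------------------
static Float trajectory_sketch(AlgMomentum &mom_alg, int n_md_steps, Float step_size)
{
  mom_alg.heatbath();                   // new Gaussian momenta, MdTime reset to 0
  Float h_old = mom_alg.energy();       // momentum part of H at the start

  for (int s = 0; s < n_md_steps; s++) {
    // ... momentum update from the gauge/fermion force would go here ...
    mom_alg.evolve(step_size, 1);       // evolve the gauge field one step
  }

  Float h_new = mom_alg.energy();       // momentum part of H at the end
  return h_new - h_old;                 // enters the Metropolis accept/reject test
}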
// The outer product of v and w, multiplied by the parity factor and
// coefficient, is added to the force f.
// N.B. The force term is multiplied by -1 on ODD parity sites.  This is
// the MILC convention and is here to test against the MILC code.
// It should eventually be changed to EVEN for CPS.
void Fasqtad::force_product_sum(const Vector *v, const Vector *w,
                                IFloat coeff, Matrix *f)
{
  static int vol = 0;
  char *fname = "force_product_sum(*V,*V,F,*M)";
  if (vol==0)
    vol = GJP.XnodeSites() * GJP.YnodeSites()
        * GJP.ZnodeSites() * GJP.TnodeSites();
  ForceFlops += 78*vol;

  unsigned long v2 = (unsigned long)v;
  if( qalloc_is_fast((Vector *)v) && qalloc_is_fast((Vector *)w)
      && qalloc_is_fast((Matrix *)f) )
    v2 = v2 - 0xb0000000 + 0x9c000000;

#ifdef PROFILE
  Float dtime = -dclock();
#endif
  IFloat coeff2 = 2.0*coeff;
  Force_cross2dag((Vector *)v2, w, f, vol/2, &coeff2);
#ifdef PROFILE
  dtime += dclock();
  print_flops(cname,fname,78*vol,dtime);
#endif

#if 0
  {
    IFloat *tmp = (IFloat *)f;
    printf("result[0]=");
    for(int i = 0; i < 18; i++){
      printf(" %0.8e",*(tmp++));
      if(i%6==5) printf("\n");
    }
    tmp = (IFloat *)&(f[vol-1]);
    printf("result[%d]=",vol-1);
    for(int i = 0; i < 18; i++){
      printf(" %0.8e",*(tmp++));
      if(i%6==5) printf("\n");
    }
  }
  exit(54);
#endif
}
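// ---------------------------------------------------------------------------
// Scalar sketch (an assumption based on the comment above, not on the
// optimized Force_cross2dag kernel) of the per-site operation: the 3x3 colour
// matrix f is incremented by the outer product v w^dagger, scaled by the
// coefficient and by -1 on odd-parity sites.  Complex numbers are stored as
// (re, im) pairs, as elsewhere in the code.
// ---------------------------------------------------------------------------
static void outer_product_accum_sketch(const double v[6], const double w[6],
                                       double coeff, int odd_parity, double f[18])
{
  double s = odd_parity ? -coeff : coeff;   // MILC sign convention
  for (int a = 0; a < 3; a++)
    for (int b = 0; b < 3; b++) {
      double re = v[2*a]   * w[2*b] + v[2*a+1] * w[2*b+1];  // Re(v_a conj(w_b))
      double im = v[2*a+1] * w[2*b] - v[2*a]   * w[2*b+1];  // Im(v_a conj(w_b))
      f[2*(3*a+b)]   += s * re;
      f[2*(3*a+b)+1] += s * im;
    }
}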
/* time_marker_t, get_time() and print_flops() are assumed to come from the
   benchmark's timing helpers (not shown here).  The fallback below is an
   assumption needed so the kernel compiles if no project header supplies
   min(). */
#ifndef min
#define min(x, y) ((x) < (y) ? (x) : (y))
#endif

int main(int argc, char **argv)
{
  int n;                        /* problem size */
  double *a, *b, *c;
  int mem_size;
  int i, j, k;
  int ii, jj, kk;
  double para_time_result;
  int block_size = 16;
  int num_thread = 0;
  n = 500;                      /* default problem size */
  double flops;

  /* Default values if nothing is given on the command line */
  if(argc > 1){
    n = atoi(argv[1]);
  }
  if(argc > 2){
    block_size = atoi(argv[2]);
  }

  /* Dynamic memory allocation */
  mem_size = n * n * sizeof(double);
  a = (double*)malloc(mem_size);
  b = (double*)malloc(mem_size);
  c = (double*)malloc(mem_size);
  if(0 == a || 0 == b || 0 == c){
    printf("memory allocation failed\n");
    return 1;
  }

  /* initialisation */
  for (i = 0; i < n; i++){
    for (j = 0; j < n; j++){
      *(a + i * n + j) = (double)i + (double)j;
      *(b + i * n + j) = (double)(n - i) + (double)(n - j);
    }
  }
  memset(c, 0, mem_size);

  /* Parallelising the matrix multiplication with OpenMP, where each
     cache block is handled by only one thread, starts here. */
  time_marker_t time = get_time();

#pragma omp parallel default(none) shared(block_size, n, a, b, c) private(i, j, k, ii, jj, kk) reduction(+:num_thread)
  {
    num_thread += 1;
#pragma omp for schedule(dynamic)
    for(i = 0; i < n; i += block_size){
      for(j = 0; j < n; j += block_size){
        for(k = 0; k < n; k += block_size){
          for(ii = i; ii < min(i + block_size, n); ii++) {
            for(jj = j; jj < min(j + block_size, n); jj++) {
              for(kk = k; kk < min(k + block_size, n); kk++) {
                c[ii * n + jj] += a[ii * n + kk] * b[kk * n + jj];
              }
            }
          }
        }
      }
    }
#pragma omp barrier
  }

  flops = 2.0 * n * n * n;
  para_time_result = print_flops(flops, time);

  printf("\nOpenMP: Problem size = %d, cache block size = %d, num_threads = %d, time=%g\n",
         n, block_size, num_thread, para_time_result);
  printf("****************************************************************************************\n");
  /* matrix multiplication with OpenMP ends here */

  free(a);
  free(b);
  free(c);
  return(0);
}
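/* ---------------------------------------------------------------------------
   Hedged companion sketch (not part of the original source): an unblocked
   reference multiplication that can be used to validate the cache-blocked
   result above by comparing entries of c.  A typical build line would be
   something like
       gcc -O2 -fopenmp blocked_mm.c -o blocked_mm && ./blocked_mm 1024 32
   (file name and flags are assumptions).
   ------------------------------------------------------------------------ */
static void matmul_reference(const double *a, const double *b, double *c, int n)
{
  for (int i = 0; i < n; i++)
    for (int j = 0; j < n; j++) {
      double sum = 0.0;
      for (int k = 0; k < n; k++)
        sum += a[i * n + k] * b[k * n + j];
      c[i * n + j] = sum;    /* overwrite; caller compares against the blocked c */
    }
}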
void PT::mat_cb_norm(int n, IFloat **mout, IFloat **min, const int *dir, int parity, IFloat * gauge) { //List of the different directions int wire[MAX_DIR]; int i; // printf("PT::mat_cb_norm\n"); QMP_msgmem_t *msg_mem_p = (QMP_msgmem_t *)Alloc("","vec_cb_norm", "msg_mem_p", 2*non_local_dirs*sizeof(QMP_msgmem_t)); QMP_msghandle_t* msg_handle_p = (QMP_msghandle_t *)Alloc("","vec_cb_norm", "msg_handle_p", 2*non_local_dirs*sizeof(QMP_msghandle_t)); QMP_msghandle_t multiple; static int call_num = 0; int vlen = VECT_LEN; int vlen2 = VECT_LEN; call_num++; //Name our function char *fname="pt_mat_cb()"; // VRB.Func("",fname); //Set the transfer directions //If wire[i] is even, then we have communication in the negative direction //If wire[i] is odd, then we have communication in the positive direction for(i=0;i<n;i++) wire[i]=dir[i]; #ifdef PROFILE Float dtime = - dclock(); #endif int non_local_dir=0; //#pragma omp parallel default(shared) { //If wire[i] is odd, then we have parallel transport in the //positive direction. In this case, multiplication by the link matrix is //done before the field is transferred over to the adjacent node // //If we have transfer in the negative T direction (wire[i] = 6), then //we have to copy the appropriate fields to a send buffer //#pragma omp for for(i=0;i<n;i++) { if(!local[wire[i]/2]) { if(wire[i]%2) { if(conjugated) pt_cmm_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb_pre[parity][wire[i]/2],(long)min[i],(long)snd_buf_cb[wire[i]/2],(long)gauge); else pt_cmm_dag_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb_pre[parity][wire[i]/2],(long)min[i],(long)snd_buf_cb[wire[i]/2],(long)gauge); } else if((wire[i] == 6)) { for(int j = 0; j < non_local_chi_cb[6];j++) memcpy(snd_buf_t_cb + j*GAUGE_LEN,min[i] + 3 * *(Toffset[parity]+j)*3,GAUGE_LEN*sizeof(IFloat)); } } } //#pragma omp barrier //#pragma omp master { for(i=0;i<n;i++) if(!local[wire[i]/2]) { //Calculate the starting address for the data to be sent IFloat *addr = min[i] + GAUGE_LEN * offset_cb[wire[i]]; msg_mem_p[2*non_local_dir] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat)); //Initialize the msg_mem for sends if(wire[i]%2) msg_mem_p[2*non_local_dir+1] = QMP_declare_msgmem((void *)snd_buf_cb[wire[i]/2], 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat)); else if(wire[i] == 6) msg_mem_p[2*non_local_dir+1] = QMP_declare_msgmem((void *)snd_buf_t_cb, 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat)); else msg_mem_p[2*non_local_dir+1] = QMP_declare_strided_msgmem((void *)addr, (size_t)(3*blklen_cb[wire[i]]), numblk_cb[wire[i]], (ptrdiff_t)(3*stride_cb[wire[i]]+3*blklen_cb[wire[i]])); msg_handle_p[2*non_local_dir] = QMP_declare_receive_relative(msg_mem_p[2*non_local_dir], wire[i]/2, 1-2*(wire[i]%2), 0); msg_handle_p[2*non_local_dir+1] = QMP_declare_send_relative(msg_mem_p[2*non_local_dir+1], wire[i]/2, 2*(wire[i]%2)-1, 0); non_local_dir++; } if(non_local_dir) { multiple = QMP_declare_multiple(msg_handle_p, 2*non_local_dir); QMP_start(multiple); } } //#pragma omp master { //Do local calculations //#pragma omp for for(i=0;i<n;i++) { if((wire[i]%2 && conjugated) || ((wire[i]%2 == 0) && (conjugated == 0))) pt_cmm_cpp(local_chi_cb[wire[i]],(long)uc_l_cb[parity][wire[i]],(long)min[i],(long)mout[i],(long)gauge); else pt_cmm_dag_cpp(local_chi_cb[wire[i]],(long)uc_l_cb[parity][wire[i]],(long)min[i],(long)mout[i],(long)gauge); } //#pragma omp barrier //#pragma omp master { if(non_local_dir) { QMP_status_t qmp_complete_status = QMP_wait(multiple); if (qmp_complete_status != 
QMP_SUCCESS) QMP_error("Send failed in vec_cb_norm: %s\n", QMP_error_string(qmp_complete_status)); QMP_free_msghandle(multiple); for(int i = 0; i < 2*non_local_dir; i++) QMP_free_msgmem(msg_mem_p[i]); Free(msg_handle_p); Free(msg_mem_p); } } //#pragma omp master { //If wire[i] is even, then we have transport in the negative direction //In this case, the vector field is multiplied by the SU(3) link matrix //after all communication is complete IFloat *fp0,*fp1; //#pragma omp for for(i=0;i<n;i++) { if(!local[wire[i]/2]) { if(!(wire[i]%2)) { if(conjugated) pt_cmm_dag_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb[parity][wire[i]],(long)rcv_buf[wire[i]],(long)mout[i],(long)gauge); else pt_cmm_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb[parity][wire[i]],(long)rcv_buf[wire[i]],(long)mout[i],(long)gauge); } //Otherwise we have parallel transport in the positive direction. //In this case, the received data has already been pre-multiplied //All we need to do is to put the transported field in the correct place else { //int destination, source; //Place the data in the receive buffer into the result vector for(int s=0;s<non_local_chi_cb[wire[i]];s++) { //source = uc_nl_cb[parity][wire[i]][s].src; fp0 = (IFloat *)((long)rcv_buf[wire[i]]+3*uc_nl_cb[parity][wire[i]][s].src); //destination = uc_nl_cb[parity][wire[i]][s].dest; fp1 = (IFloat *)(mout[i]+3*uc_nl_cb[parity][wire[i]][s].dest); memcpy(fp1,fp0,GAUGE_LEN*sizeof(IFloat)); } } } } } //#pragma omp parallel #ifdef PROFILE dtime +=dclock(); print_flops("",fname,99*vol*n,dtime); #endif // ParTrans::PTflops +=99*n*vol; }
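// ---------------------------------------------------------------------------
// Hedged sketch (assumes <qmp.h> is available; not part of the original
// source) of the QMP message lifecycle that mat_cb_norm() above and mat()
// below follow for every non-local direction: declare the memory, declare a
// relative receive/send pair, bundle them, start, overlap local work, wait,
// then free.  Only one exchange along one axis is shown; `axis` is 0..3 and
// `sign` is +1 or -1.
// ---------------------------------------------------------------------------
static void qmp_exchange_sketch(void *send_buf, void *recv_buf,
                                size_t bytes, int axis, int sign)
{
  QMP_msgmem_t mem[2];
  QMP_msghandle_t handle[2];

  mem[0] = QMP_declare_msgmem(recv_buf, bytes);
  mem[1] = QMP_declare_msgmem(send_buf, bytes);

  handle[0] = QMP_declare_receive_relative(mem[0], axis, -sign, 0);
  handle[1] = QMP_declare_send_relative(mem[1], axis,  sign, 0);

  QMP_msghandle_t multiple = QMP_declare_multiple(handle, 2);
  QMP_start(multiple);
  // ... purely local work can be overlapped here, as in mat_cb_norm() ...
  QMP_status_t status = QMP_wait(multiple);
  if (status != QMP_SUCCESS)
    QMP_error("exchange failed: %s\n", QMP_error_string(status));

  QMP_free_msghandle(multiple);    // frees the bundled handles as in the code above
  QMP_free_msgmem(mem[0]);
  QMP_free_msgmem(mem[1]);
}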
void PT::mat(int n, matrix **mout, matrix **min, const int *dir){ int wire[MAX_DIR]; int i; QMP_msgmem_t msg_mem_p[2*MAX_DIR]; QMP_msghandle_t msg_handle_p[2*MAX_DIR]; QMP_msghandle_t multiple; static double setup=0.,qmp=0.,localt=0.,nonlocal=0.; static int call_num = 0; call_num++; // char *fname="pt_mat()"; // VRB.Func("",fname); // if (call_num%100==1) printf("PT:mat()\n"); for(i=0;i<n;i++) wire[i] = dir[i]; #ifdef PROFILE Float dtime2 = - dclock(); #endif double dtime = -dclock(); int non_local_dir=0; for(i=0;i<n;i++) if (!local[wire[i]/2]) { //Calculate the address for transfer in a particular direction Float * addr = ((Float *)min[i]+GAUGE_LEN*offset[wire[i]]); msg_mem_p[2*non_local_dir] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], 3*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat)); msg_mem_p[2*non_local_dir+1] = QMP_declare_strided_msgmem((void *)addr, (size_t)(3*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(3*stride[wire[i]]+3*blklen[wire[i]])); msg_handle_p[2*non_local_dir] = QMP_declare_receive_relative(msg_mem_p[2*non_local_dir], wire[i]/2, 1-2*(wire[i]%2), 0); msg_handle_p[2*non_local_dir+1] = QMP_declare_send_relative(msg_mem_p[2*non_local_dir+1], wire[i]/2, 2*(wire[i]%2)-1, 0); non_local_dir++; } if (call_num==1 && !QMP_get_node_number()) printf("non_local_dir=%d\n",non_local_dir); if(non_local_dir) { multiple = QMP_declare_multiple(msg_handle_p, 2*non_local_dir); QMP_start(multiple); } dtime += dclock(); setup +=dtime; dtime = -dclock(); int if_print = 0; // if ( (call_num%10000==1) && (!QMP_get_node_number()) ) if_print=1; #define USE_TEST2 #ifdef USE_TEST2 //assume nt > n! static char *cname="mat()"; #pragma omp parallel default(shared) { int iam,nt,ipoints,istart,offset; iam = omp_get_thread_num(); nt = omp_get_num_threads(); int nt_dir = nt/n; int n_t = iam/nt_dir; int i_t = iam%nt_dir; if (n_t >= n ){ n_t = n-1; i_t = iam - (n-1)*nt_dir; nt_dir = nt -(n-1)*nt_dir; } int w_t = wire[n_t]; ipoints = (local_chi[w_t]/2)/nt_dir; offset = ipoints*i_t; if (i_t == (nt_dir-1)) ipoints = (local_chi[w_t]/2)-offset; if ( if_print ) printf("thread %d of %d nt_dir n_t i_t ipoints offset= %d %d %d %d %d\n",iam,nt,nt_dir,n_t,i_t,ipoints,offset); //Interleaving of local computation of matrix multiplication partrans_cmm_agg((uc_l[w_t]+offset*2),min[n_t],mout[n_t],ipoints); if ( if_print ) printf("thread %d of %d done\n",iam,nt); } #else { //Interleaving of local computation of matrix multiplication #pragma omp parallel for default(shared) for(i=0;i<n;i++){ partrans_cmm_agg(uc_l[wire[i]],min[i],mout[i],local_chi[wire[i]]/2); } } #endif dtime += dclock(); localt +=dtime; dtime = -dclock(); //#pragma omp barrier //#pragma omp master { if(non_local_dir) { QMP_status_t qmp_complete_status = QMP_wait(multiple); if (qmp_complete_status != QMP_SUCCESS) QMP_error("Send failed in vec_cb_norm: %s\n", QMP_error_string(qmp_complete_status)); QMP_free_msghandle(multiple); for(int i = 0; i < 2*non_local_dir; i++) QMP_free_msgmem(msg_mem_p[i]); // Free(msg_handle_p); // Free(msg_mem_p); } } //#pragma omp master { dtime += dclock(); qmp +=dtime; dtime = -dclock(); //Do non-local computations #ifdef USE_TEST2 //assume nt > n! 
#pragma omp parallel default(shared) { int iam,nt,ipoints,istart,offset; iam = omp_get_thread_num(); nt = omp_get_num_threads(); int nt_dir = nt/n; int n_t = iam/nt_dir; int i_t = iam%nt_dir; if (n_t >= n ){ n_t = n-1; i_t = iam - (n-1)*nt_dir; nt_dir = nt -(n-1)*nt_dir; } int w_t = wire[n_t]; ipoints = (non_local_chi[w_t]/2)/nt_dir; offset = ipoints*i_t; if (i_t == (nt_dir-1)) ipoints = (non_local_chi[w_t]/2)-offset; if ( if_print ) printf("thread %d of %d nt_dir n_t i_t ipoints offset= %d %d %d %d %d\n",iam,nt,nt_dir,n_t,i_t,ipoints,offset); //Non-local computation if (ipoints>0) partrans_cmm_agg((uc_nl[w_t]+offset*2),(matrix *)rcv_buf[w_t],mout[n_t],ipoints); if ( if_print ) printf("thread %d of %d done\n",iam,nt); } #else { #pragma omp parallel for for(i=0;i<n;i++) if (!local[wire[i]/2]) { #ifdef USE_OMP if (call_num%10000==1 && !QMP_get_node_number() ) printf("thread %d of %d i=%d\n",omp_get_thread_num(),omp_get_num_threads(),i); #endif partrans_cmm_agg(uc_nl[wire[i]],(matrix *)rcv_buf[wire[i]],mout[i],non_local_chi[wire[i]]/2); } }//#pragma omp parallel #endif dtime += dclock(); nonlocal +=dtime; if (call_num%100==0){ static char *cname="mat()"; if (!QMP_get_node_number() ) { print_flops("mat():local*100",0,localt); print_flops("mat():nonlocal*100",0,nonlocal); print_flops("mat():qmp*100",0,qmp); print_flops("mat():setup*100",0,setup); } localt=nonlocal=qmp=setup=0.; } #ifdef PROFILE dtime2 +=dclock(); print_flops("",fname,198*vol*n,dtime2); #endif // ParTrans::PTflops +=198*n*vol; }
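// ---------------------------------------------------------------------------
// Hedged illustration (not part of the original source) of the thread
// partitioning arithmetic used in the USE_TEST2 branches of PT::mat() above.
// The nt OpenMP threads are split into n groups, one per hop direction; the
// spill-over threads join the last group, and within a group each thread gets
// a contiguous chunk of the npoints sites, with the last thread taking the
// remainder.  This mirrors the nt_dir/n_t/i_t/ipoints/offset computation and
// assumes nt >= n, as the comment in mat() notes.
// ---------------------------------------------------------------------------
static void partition_sketch(int iam, int nt, int n, int npoints,
                             int *dir_index, int *chunk_offset, int *chunk_len)
{
  int nt_dir = nt / n;              // threads per direction
  int n_t = iam / nt_dir;           // which direction this thread works on
  int i_t = iam % nt_dir;           // rank of this thread within its group
  if (n_t >= n) {                   // spill-over threads join the last group
    n_t = n - 1;
    i_t = iam - (n - 1) * nt_dir;
    nt_dir = nt - (n - 1) * nt_dir;
  }
  int len = npoints / nt_dir;
  int off = len * i_t;
  if (i_t == nt_dir - 1) len = npoints - off;   // last thread takes the remainder

  *dir_index = n_t;
  *chunk_offset = off;
  *chunk_len = len;
}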
//------------------------------------------------------------------ // "Odd" fermion force evolution routine written by Chris Dawson, taken // verbatim, so performance will suck on qcdoc. //------------------------------------------------------------------ ForceArg FdwfBase::EvolveMomFforce( Matrix* mom, // momenta Vector* phi, // odd pseudo-fermion field Vector* eta, // very odd pseudo-fermion field Float mass, Float dt ) { char *fname = "EvolveMomFforce(M*,V*,V*,F,F)"; VRB.Func(cname,fname); Matrix *gauge = GaugeField() ; if (Colors() != 3) { ERR.General(cname,fname,"Wrong nbr of colors.") ; } if (SpinComponents()!=4) { ERR.General(cname,fname,"Wrong nbr of spin comp.") ;} if (mom == 0) { ERR.Pointer(cname,fname,"mom") ; } if (phi == 0) { ERR.Pointer(cname,fname,"phi") ; } // allocate space for two CANONICAL fermion fields // these are all full fermion vector sizes ( i.e. *not* preconditioned ) const int f_size ( FsiteSize() * GJP.VolNodeSites() ); const int f_size_cb ( f_size/2 ) ; // f_size must be multiple of 2 const int f_site_size_4d( 2 * Colors() * SpinComponents() ); const int f_size_4d ( f_site_size_4d * GJP.VolNodeSites()) ; char *str_v1 = "v1" ; Vector *v1 = (Vector *)smalloc(f_size*sizeof(Float)) ; if (v1 == 0) ERR.Pointer(cname, fname, str_v1) ; VRB.Smalloc(cname, fname, str_v1, v1, f_size*sizeof(Float)) ; char *str_v2 = "v2" ; Vector *v2 = (Vector *)smalloc(f_size*sizeof(Float)) ; if (v2 == 0) ERR.Pointer(cname, fname, str_v2) ; VRB.Smalloc(cname, fname, str_v2, v2, f_size*sizeof(Float)) ; Float L1 = 0.0; Float L2 = 0.0; Float Linf = 0.0; #ifdef PROFILE Float time = -dclock(); ForceFlops=0; #endif //Calculate v1, v2. Both must be in CANONICAL order afterwards { CgArg cg_arg ; cg_arg.mass = mass ; DiracOpDwf dwf(*this, v1, v2, &cg_arg, CNV_FRM_YES) ; Float kappa( 1.0 / ( 2 * (4 + GJP.DwfA5Inv() - GJP.DwfHeight()))); v2->CopyVec(phi,f_size_cb); // rescale the input field. As the second half of the this field // will be constructed by acting with the PC dslash on v1, this // rescales *one* of the full vectors - giving rise to an overall // rescaling of the final answer by exactly -\kappa^2 v2->VecTimesEquFloat(-kappa*kappa,f_size_cb); // only need one factor of -\kappa^2, so don't rescale the second // full vector (v2) v1->CopyVec(eta,f_size_cb); dwf.Dslash(v2+(f_size_cb/6), v2 , CHKB_ODD, DAG_YES); dwf.Dslash(v1+(f_size_cb/6), v1 , CHKB_ODD, DAG_NO); // v1 and v2 are now the vectors needed to contruct the force term // written in ( ODD, EVEN ) ordering. They will be converted back // into canonical ordering when the destructor is called. } // two fermion vectors at a single position // - these will be used to store off-node // field components char *str_site_v1 = "site_v1" ; Float *site_v1 = (Float *)smalloc(FsiteSize()*sizeof(Float)) ; if (site_v1 == 0) ERR.Pointer(cname, fname, str_site_v1) ; VRB.Smalloc(cname, fname, str_site_v1, site_v1, FsiteSize()*sizeof(Float)) ; char *str_site_v2 = "site_v2" ; Float *site_v2 = (Float *)smalloc(FsiteSize()*sizeof(Float)) ; if (site_v2 == 0) ERR.Pointer(cname, fname, str_site_v2) ; VRB.Smalloc(cname, fname, str_site_v2, site_v2, FsiteSize()*sizeof(Float)) ; // evolve the momenta by the fermion force int mu, x, y, z, t, s; const int lx(GJP.XnodeSites()); const int ly(GJP.YnodeSites()); const int lz(GJP.ZnodeSites()); const int lt(GJP.TnodeSites()); const int ls(GJP.SnodeSites()); // start by summing first over direction (mu) and then over site to // allow SCU transfers to happen face-by-face in the outermost loop. 
VRB.Clock(cname, fname, "Before loop over links.\n") ; for (mu=0; mu<4; mu++) { for (t=0; t<lt; t++){ for (z=0; z<lz; z++){ for (y=0; y<ly; y++){ for (x=0; x<lx; x++) { // position offset int gauge_offset = x+lx*(y+ly*(z+lz*t)); // offset for vector field at this point // (4d only, no fifth dimension) int vec_offset = f_site_size_4d*gauge_offset ; // offset for link in mu direction from this point gauge_offset = mu+4*gauge_offset ; Float *v1_plus_mu=NULL ; Float *v2_plus_mu=NULL ; int vec_plus_mu_stride=0 ; int vec_plus_mu_offset = f_site_size_4d ; // sign of coeff (look at momenta update) Float coeff = -2.0 * dt ; switch (mu) { case 0 : // next position in mu direction vec_plus_mu_offset *= (x+1)%lx+lx*(y+ly*(z+lz*t)) ; // vec_plus_mu_offset now the correct // offset for a fermion field at this point // in the lattice if ((x+1) == lx) { // off-node for (s=0; s<ls; s++) { // fill site_v1 and site_v2 with v1 and v2 data // from x=0 on next node, need loop because // data is not contiguous in memory getPlusData( (Float *)site_v1+s*f_site_size_4d, (Float *)v1+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; getPlusData( (Float*)site_v2+s*f_site_size_4d, (Float*)v2+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; } // end for s v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; vec_plus_mu_stride = 0 ; // field now contiguous // GJP.XnodeBc() gives the forward boundary // condition only (so this should work). if (GJP.XnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { // on - node // // just add offset to v1 and v2 // (they are now 1 forward in the mu direction ) // v1_plus_mu = (Float*)v1+vec_plus_mu_offset ; v2_plus_mu = (Float*)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; // explained below } break ; // Repeat for the other directions case 1 : vec_plus_mu_offset *= x+lx*((y+1)%ly+ly*(z+lz*t)) ; if ((y+1) == ly) { for (s=0; s<ls; s++) { getPlusData( (Float*)site_v1+s*f_site_size_4d, (Float*)v1+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; getPlusData( (Float*)site_v2+s*f_site_size_4d, (Float*)v2+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; } v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; vec_plus_mu_stride = 0 ; if (GJP.YnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; } break ; case 2 : vec_plus_mu_offset *= x+lx*(y+ly*((z+1)%lz+lz*t)) ; if ((z+1) == lz) { for (s=0; s<ls; s++) { getPlusData( (Float*)site_v1+s*f_site_size_4d, (Float*)v1+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; getPlusData( (Float*)site_v2+s*f_site_size_4d, (Float*)v2+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; } v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; vec_plus_mu_stride = 0 ; if (GJP.ZnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; } break ; case 3 : vec_plus_mu_offset *= x+lx*(y+ly*(z+lz*((t+1)%lt))) ; if ((t+1) == lt) { for (s=0; s<ls; s++) { getPlusData( (Float*)site_v1+s*f_site_size_4d, (Float*)v1+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; getPlusData( (Float*)site_v2+s*f_site_size_4d, (Float*)v2+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; } v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; vec_plus_mu_stride = 0 ; if (GJP.TnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float 
*)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; } } // end (the evil) mu switch Matrix tmp_mat1, tmp_mat2; // the non-zero stride pattern is due to domain wall // fermions ( summing up *ls* different sproj's ) // // f_size_4d-f_site_size_4d is the number of floats // between the end of one spinor at s and the start of the // spinor at s+1 // // vec_plus_mu_stride is the same, except when // this is off boundary, in that case the info // is copied into a contiguous block in the above code // and vec_plus_mu_stride set to zero // ( 1 - gamma_\mu ) Tr_s [ v1(x+\mu) v2^{\dagger}(x) ] sproj_tr[mu]( (Float *)&tmp_mat1, (Float *)v1_plus_mu, (Float *)v2+vec_offset, ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ; // (1 + gamma_\mu) Tr_s [ v2(x+\mu) v1^{\dagger}(x) ] sproj_tr[mu+4]( (Float *)&tmp_mat2, (Float *)v2_plus_mu, (Float *)v1+vec_offset, ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ; // exactly what is sounds like tmp_mat1 += tmp_mat2 ; if(GJP.Snodes() != 1) { for (s=0; s<(sizeof(Matrix)/sizeof(Float)); ++s) { glb_sum_dir((Float *)&tmp_mat1 + s, 4) ; } } // multiply sum by the link in the \mu direction tmp_mat2.DotMEqual(*(gauge+gauge_offset), tmp_mat1) ; // take tracless antihermitian piece // TrLessAntiHermMatrix need to be passed // the dagger of the matrix in question tmp_mat1.Dagger(tmp_mat2) ; tmp_mat2.TrLessAntiHermMatrix(tmp_mat1) ; tmp_mat2 *= coeff ; // note the minus sign. *(mom+gauge_offset) -= tmp_mat2 ; Float norm = tmp_mat2.norm(); Float tmp = sqrt(norm); L1 += tmp; L2 += norm; Linf = (tmp>Linf ? tmp : Linf); } // end for x } // end for y } // end for z } // end for t } // end for mu ForceFlops += (2*9*16*ls + 18+ 198+36+24)*lx*ly*lz*lt*4; #ifdef PROFILE time += dclock(); print_flops(cname,fname,ForceFlops,time); #endif // deallocate smalloc'd space VRB.Sfree(cname, fname, str_site_v2, site_v2) ; sfree(site_v2) ; VRB.Sfree(cname, fname, str_site_v1, site_v1) ; sfree(site_v1) ; VRB.Sfree(cname, fname, str_v2, v2) ; sfree(v2) ; VRB.Sfree(cname, fname, str_v1, v1) ; sfree(v1) ; glb_sum(&L1); glb_sum(&L2); glb_max(&Linf); L1 /= 4.0*GJP.VolSites(); L2 /= 4.0*GJP.VolSites(); VRB.FuncEnd(cname,fname); return ForceArg(L1, sqrt(L2), Linf); }
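// ---------------------------------------------------------------------------
// Hedged sketch (not part of the original source) of the layout arithmetic
// behind vec_plus_mu_stride in the force routine above.  With s-slice-major
// storage, the 4d spinor of site `site` on slice `s` starts at
//     s * f_size_4d + site * f_site_size_4d
// floats, so between the end of one site's spinor at slice s and the start of
// the same site's spinor at slice s+1 there are f_size_4d - f_site_size_4d
// floats: the stride passed to sproj_tr when the data stays in place, and 0
// when it has been gathered into a contiguous boundary buffer.
// ---------------------------------------------------------------------------
static int spinor_start(int s, int site, int f_size_4d, int f_site_size_4d)
{
  return s * f_size_4d + site * f_site_size_4d;
}

static int interslice_stride(int f_size_4d, int f_site_size_4d)
{
  // spinor_start(s+1, site) - (spinor_start(s, site) + f_site_size_4d)
  return f_size_4d - f_site_size_4d;
}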
CPS_START_NAMESPACE /*!\file \brief Implementation of FdwfBase class. $Id: f_dwf_base_force.C,v 1.14 2012-08-31 04:55:08 chulwoo Exp $ */ //-------------------------------------------------------------------- // CVS keywords // // $Source: /home/chulwoo/CPS/repo/CVS/cps_only/cps_pp/src/util/lattice/f_dwf_base/noarch/f_dwf_base_force.C,v $ // $State: Exp $ // //-------------------------------------------------------------------- //------------------------------------------------------------------ // // f_dwf_base_force.C // // (R)HMC force term for FdwfBase // //------------------------------------------------------------------ CPS_END_NAMESPACE #include <util/qcdio.h> #include <math.h> #include <util/lattice.h> #include <util/dirac_op.h> #include <util/dwf.h> #include <util/gjp.h> #include <util/verbose.h> #include <util/vector.h> #include <util/random.h> #include <util/error.h> #include <util/time_cps.h> #include <comms/scu.h> // GRF #include <comms/glb.h> CPS_START_NAMESPACE #undef PROFILE // CJ: change start //------------------------------------------------------------------ // EvolveMomFforce(Matrix *mom, Vector *chi, Float mass, // Float dt): // It evolves the canonical momentum mom by dt // using the fermion force. //------------------------------------------------------------------ ForceArg FdwfBase::EvolveMomFforce(Matrix *mom, Vector *chi, Float mass, Float dt){ char *fname = "EvolveMomFforce(M*,V*,F,F,F)"; VRB.Func(cname,fname); Matrix *gauge = GaugeField() ; if (Colors() != 3) ERR.General(cname,fname,"Wrong nbr of colors.") ; if (SpinComponents() != 4) ERR.General(cname,fname,"Wrong nbr of spin comp.") ; if (mom == 0) ERR.Pointer(cname,fname,"mom") ; if (chi == 0) ERR.Pointer(cname,fname,"chi") ; //---------------------------------------------------------------- // allocate space for two CANONICAL fermion fields //---------------------------------------------------------------- int f_size = FsiteSize() * GJP.VolNodeSites() ; int f_site_size_4d = 2 * Colors() * SpinComponents(); int f_size_4d = f_site_size_4d * GJP.VolNodeSites() ; char *str_v1 = "v1" ; Vector *v1 = (Vector *)smalloc(f_size*sizeof(Float)) ; if (v1 == 0) ERR.Pointer(cname, fname, str_v1) ; VRB.Smalloc(cname, fname, str_v1, v1, f_size*sizeof(Float)) ; char *str_v2 = "v2" ; Vector *v2 = (Vector *)smalloc(f_size*sizeof(Float)) ; if (v2 == 0) ERR.Pointer(cname, fname, str_v2) ; VRB.Smalloc(cname, fname, str_v2, v2, f_size*sizeof(Float)) ; //---------------------------------------------------------------- // allocate buffer space for two fermion fields that are assoc // with only one 4-D site. //---------------------------------------------------------------- char *str_site_v1 = "site_v1" ; Float *site_v1 = (Float *)smalloc(FsiteSize()*sizeof(Float)) ; if (site_v1 == 0) ERR.Pointer(cname, fname, str_site_v1) ; VRB.Smalloc(cname, fname, str_site_v1, site_v1, FsiteSize()*sizeof(Float)) ; char *str_site_v2 = "site_v2" ; Float *site_v2 = (Float *)smalloc(FsiteSize()*sizeof(Float)) ; if (site_v2 == 0) ERR.Pointer(cname, fname, str_site_v2) ; VRB.Smalloc(cname, fname, str_site_v2, site_v2, FsiteSize()*sizeof(Float)) ; Float L1 = 0.0; Float L2 = 0.0; Float Linf = 0.0; //---------------------------------------------------------------- // Calculate v1, v2. Both v1, v2 must be in CANONICAL order after // the calculation. 
//---------------------------------------------------------------- VRB.Clock(cname, fname, "Before calc force vecs.\n") ; VRB.Flow(cname, fname, "Before calc force vecs.\n") ; { CgArg cg_arg ; cg_arg.mass = mass ; DiracOpDwf dwf(*this, v1, v2, &cg_arg, CNV_FRM_YES) ; dwf.CalcHmdForceVecs(chi) ; } VRB.Flow(cname, fname, "After calc force vecs.\n") ; #ifdef PROFILE Float time = -dclock(); ForceFlops=0; #endif int mu, x, y, z, t, s, lx, ly, lz, lt, ls ; lx = GJP.XnodeSites() ; ly = GJP.YnodeSites() ; lz = GJP.ZnodeSites() ; lt = GJP.TnodeSites() ; ls = GJP.SnodeSites() ; Matrix tmp_mat1, tmp_mat2 ; //------------------------------------------------------------------ // start by summing first over direction (mu) and then over site // to allow SCU transfers to happen face-by-face in the outermost // loop. //------------------------------------------------------------------ VRB.Clock(cname, fname, "Before loop over links.\n") ; for (mu=0; mu<4; mu++){ for (t=0; t<lt; t++){ for (z=0; z<lz; z++){ for (y=0; y<ly; y++){ for (x=0; x<lx; x++){ int gauge_offset = x+lx*(y+ly*(z+lz*t)) ; int vec_offset = f_site_size_4d*gauge_offset ; gauge_offset = mu+4*gauge_offset ; Float *v1_plus_mu ; Float *v2_plus_mu ; int vec_plus_mu_stride ; int vec_plus_mu_offset = f_site_size_4d ; Float coeff = -2.0 * dt ; switch (mu) { case 0 : vec_plus_mu_offset *= (x+1)%lx+lx*(y+ly*(z+lz*t)) ; if ((x+1) == lx) { for (s=0; s<ls; s++) { getPlusData( (IFloat *)site_v1+s*f_site_size_4d, (IFloat *)v1+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; getPlusData( (IFloat *)site_v2+s*f_site_size_4d, (IFloat *)v2+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; } // end for s v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; vec_plus_mu_stride = 0 ; if (GJP.XnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; } break ; case 1 : vec_plus_mu_offset *= x+lx*((y+1)%ly+ly*(z+lz*t)) ; if ((y+1) == ly) { for (s=0; s<ls; s++) { getPlusData( (IFloat *)site_v1+s*f_site_size_4d, (IFloat *)v1+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; getPlusData( (IFloat *)site_v2+s*f_site_size_4d, (IFloat *)v2+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; } // end for s v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; vec_plus_mu_stride = 0 ; if (GJP.YnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; } break ; case 2 : vec_plus_mu_offset *= x+lx*(y+ly*((z+1)%lz+lz*t)) ; if ((z+1) == lz) { for (s=0; s<ls; s++) { getPlusData( (IFloat *)site_v1+s*f_site_size_4d, (IFloat *)v1+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; getPlusData( (IFloat *)site_v2+s*f_site_size_4d, (IFloat *)v2+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; } // end for s v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; vec_plus_mu_stride = 0 ; if (GJP.ZnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; } break ; case 3 : vec_plus_mu_offset *= x+lx*(y+ly*(z+lz*((t+1)%lt))) ; if ((t+1) == lt) { for (s=0; s<ls; s++) { getPlusData( (IFloat *)site_v1+s*f_site_size_4d, (IFloat *)v1+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; getPlusData( (IFloat *)site_v2+s*f_site_size_4d, (IFloat *)v2+vec_plus_mu_offset+s*f_size_4d, f_site_size_4d, mu) ; } 
// end for s v1_plus_mu = site_v1 ; v2_plus_mu = site_v2 ; vec_plus_mu_stride = 0 ; if (GJP.TnodeBc()==BND_CND_APRD) coeff = -coeff ; } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; } } // end switch mu sproj_tr[mu]( (IFloat *)&tmp_mat1, (IFloat *)v1_plus_mu, (IFloat *)v2+vec_offset, ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ; sproj_tr[mu+4]( (IFloat *)&tmp_mat2, (IFloat *)v2_plus_mu, (IFloat *)v1+vec_offset, ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ; tmp_mat1 += tmp_mat2 ; // If GJP.Snodes > 1 sum up contributions from all s nodes if(GJP.Snodes() > 1) { // if(!UniqueID())printf("%s::%s:GJP.Snodes()=%d\n",cname,fname,GJP.Snodes()); glb_sum_multi_dir((Float *)&tmp_mat1,4,sizeof(Matrix)/sizeof(IFloat)); } tmp_mat2.DotMEqual(*(gauge+gauge_offset), tmp_mat1) ; tmp_mat1.Dagger(tmp_mat2) ; tmp_mat2.TrLessAntiHermMatrix(tmp_mat1) ; tmp_mat2 *= coeff ; *(mom+gauge_offset) += tmp_mat2 ; Float norm = tmp_mat2.norm(); Float tmp = sqrt(norm); L1 += tmp; L2 += norm; Linf = (tmp>Linf ? tmp : Linf); } } } } // end for x,y,z,t } // end for mu ForceFlops += (2*9*16*ls + 18+ 198+36+24)*lx*ly*lz*lt*4; #ifdef PROFILE time += dclock(); print_flops(cname,fname,ForceFlops,time); #endif //------------------------------------------------------------------ // deallocate smalloc'd space //------------------------------------------------------------------ VRB.Sfree(cname, fname, str_site_v2, site_v2) ; sfree(site_v2) ; VRB.Sfree(cname, fname, str_site_v1, site_v1) ; sfree(site_v1) ; VRB.Sfree(cname, fname, str_v2, v2) ; sfree(v2) ; VRB.Sfree(cname, fname, str_v1, v1) ; sfree(v1) ; glb_sum(&L1); glb_sum(&L2); glb_max(&Linf); L1 /= 4.0*GJP.VolSites(); L2 /= 4.0*GJP.VolSites(); VRB.FuncEnd(cname,fname); return ForceArg(L1, sqrt(L2), Linf); }
int main(int argc,char *argv[]){ #if TARGET == QCDOC DefaultSetup(); printf("Sizes = %d %d %d %d %d %d\n",SizeX(),SizeY(),SizeZ(),SizeT(),SizeS(),SizeW()); printf("Coors = %d %d %d %d %d %d\n",CoorX(),CoorY(),CoorZ(),CoorT(),CoorS(),CoorW()); #endif FILE *fp; double dtime; //---------------------------------------------------------------- // Initializes all Global Job Parameters //---------------------------------------------------------------- DoArg do_arg; int nx,ny,nz,nt; if (argc < 5){ ERR.General("f_stag_test","main()","usage: %s nx ny nz nt\n",argv[0]); } sscanf(argv[1],"%d",&nx); sscanf(argv[2],"%d",&ny); sscanf(argv[3],"%d",&nz); sscanf(argv[4],"%d",&nt); printf("total sites = %d %d %d %d\n",nx,ny,nz,nt); #if TARGET == QCDOC do_arg.x_node_sites = nx/SizeX(); do_arg.y_node_sites = ny/SizeY(); do_arg.z_node_sites = nz/SizeZ(); do_arg.t_node_sites = nt/SizeT(); do_arg.s_node_sites = 1; do_arg.x_nodes = SizeX(); do_arg.y_nodes = SizeY(); do_arg.z_nodes = SizeZ(); do_arg.t_nodes = SizeT(); do_arg.s_nodes = 1; #else do_arg.x_node_sites = nx; do_arg.y_node_sites = ny; do_arg.z_node_sites = nz; do_arg.t_node_sites = nt; do_arg.s_node_sites = 0; do_arg.x_nodes = 1; do_arg.y_nodes = 1; do_arg.z_nodes = 1; do_arg.t_nodes = 1; do_arg.s_nodes = 1; #endif do_arg.x_bc = BND_CND_PRD; do_arg.y_bc = BND_CND_PRD; do_arg.z_bc = BND_CND_PRD; do_arg.t_bc = BND_CND_APRD; do_arg.start_conf_kind = START_CONF_DISORD; do_arg.start_seed_kind = START_SEED_FIXED; // do_arg.colors = 3; do_arg.beta = 5.5; do_arg.dwf_height = 0.9; do_arg.clover_coeff = 2.0171; // do_arg.verbose_level = -1205; CgArg cg_arg; cg_arg.mass = 0.1; cg_arg.stop_rsd = 1e-12; cg_arg.max_num_iter = 500; GJP.Initialize(do_arg); // VRB.Level(GJP.VerboseLevel()); VRB.Level(0); VRB.ActivateLevel(VERBOSE_FUNC_LEVEL); VRB.ActivateLevel(VERBOSE_FLOW_LEVEL); VRB.ActivateLevel(VERBOSE_RNGSEED_LEVEL); #if TARGET == QCDOC char filename [200]; sprintf(filename,"%s%d%d%d%d%d%d_%d%d%d%d%d%d.out",f_stag_test_filename,SizeX(),SizeY(),SizeZ(),SizeT(),SizeS(),SizeW(),CoorX(),CoorY(),CoorZ(),CoorT(),CoorS(),CoorW()); fp = Fopen(filename,"w"); #else fp = Fopen("f_stag_test.out","w"); #endif GwilsonFstag lat; Vector *result = (Vector*)smalloc(GJP.VolNodeSites()*lat.FsiteSize()*sizeof(IFloat)); Vector *X_out = (Vector*)smalloc(GJP.VolNodeSites()*lat.FsiteSize()*sizeof(IFloat)); Vector *X_out2 = (Vector*)smalloc(GJP.VolNodeSites()*lat.FsiteSize()*sizeof(IFloat)); if(!result) ERR.Pointer("","","result"); if(!X_out) ERR.Pointer("","","X_out"); if(!X_out2) ERR.Pointer("","","X_out2"); Vector *X_out_odd = &(X_out[GJP.VolNodeSites()/2]); int s[4]; Vector *X_in = (Vector*)smalloc(GJP.VolNodeSites()*lat.FsiteSize()*sizeof(IFloat)); if(!X_in) ERR.Pointer("","","X_in"); #if 1 lat.RandGaussVector(X_in,1.0); #else Vector *X_in_odd = &(X_in[GJP.VolNodeSites()/2]); Matrix *gf = lat.GaugeField(); IFloat *gf_p = (IFloat *)lat.GaugeField(); for(s[3]=0; s[3]<GJP.NodeSites(3); s[3]++) for(s[2]=0; s[2]<GJP.NodeSites(2); s[2]++) for(s[1]=0; s[1]<GJP.NodeSites(1); s[1]++) for(s[0]=0; s[0]<GJP.NodeSites(0); s[0]++) { int n = lat.FsiteOffset(s); IFloat *temp_p = (IFloat *)(gf+4*n+3); IFloat crd = 1.0*s[0]+0.1*s[1]+0.01*s[2]+0.001*s[3]; #if TARGET==QCDOC if(CoorX()==0 && CoorY()==0 && CoorZ()==0 && CoorT()==0 &&n==0) crd=1.0; else crd = 0.0; #else if(n==0) crd = 1.0; else crd = 0.0; #endif for(int v=0; v<6; v+=2){ if (v==0) *((IFloat*)&X_in[n]+v) = crd; else *((IFloat*)&X_in[n]+v) = 0; *((IFloat*)&X_in[n]+v+1) = 0.0; } } #endif Vector *out; DiracOpStag 
dirac(lat,X_out,X_in,&cg_arg,CNV_FRM_NO); for(int k = 0; k< 1; k++){ printf("k=%d ",k); if (k ==0) out = result; else out = X_out; bzero((char *)out, GJP.VolNodeSites()*lat.FsiteSize()*sizeof(IFloat)); lat.Fconvert(out,STAG,CANONICAL); lat.Fconvert(X_in,STAG,CANONICAL); int offset = GJP.VolNodeSites()*lat.FsiteSize()/ (2*6); #if 1 #if TARGET==QCDOC int vol = nx*ny*nz*nt/(SizeX()*SizeY()*SizeZ()*SizeT()); #else int vol = nx*ny*nz*nt; #endif dtime = -dclock(); int iter = dirac.MatInv(out,X_in); dtime +=dclock(); print_flops(606*iter*vol,dtime); printf("iter=%d\n",iter); #else dirac.Dslash(out,X_in+offset,CHKB_ODD,DAG_NO); dirac.Dslash(out+offset,X_in,CHKB_EVEN,DAG_NO); #endif if (k == 0){ bzero((char *)X_out2, GJP.VolNodeSites()*lat.FsiteSize()*sizeof(IFloat)); dirac.Dslash(X_out2,out+offset,CHKB_ODD,DAG_NO); dirac.Dslash(X_out2+offset,out,CHKB_EVEN,DAG_NO); lat.Fconvert(X_out2,CANONICAL,STAG); } lat.Fconvert(out,CANONICAL,STAG); lat.Fconvert(X_in,CANONICAL,STAG); X_out2->FTimesV1PlusV2(2*cg_arg.mass,out,X_out2,GJP.VolNodeSites ()*lat.FsiteSize()); Float dummy; Float dt = 2; for(s[3]=0; s[3]<GJP.NodeSites(3); s[3]++) for(s[2]=0; s[2]<GJP.NodeSites(2); s[2]++) for(s[1]=0; s[1]<GJP.NodeSites(1); s[1]++) for(s[0]=0; s[0]<GJP.NodeSites(0); s[0]++) { int n = lat.FsiteOffset(s); for(int i=0; i<3; i++){ #if TARGET == QCDOC if ( k==0 ) Fprintf(fp," %d %d %d %d %d ", CoorX()*GJP.NodeSites(0)+s[0], CoorY()*GJP.NodeSites(1)+s[1], CoorZ()*GJP.NodeSites(2)+s[2], CoorT()*GJP.NodeSites(3)+s[3], i); #else if ( k==0 ) Fprintf(fp," %d %d %d %d %d ", s[0], s[1], s[2], s[3], i); #endif if ( k==0 ) Fprintf(fp," (%0.7e %0.7e) (%0.7e %0.7e)", *((IFloat*)&result[n]+i*2), *((IFloat*)&result[n]+i*2+1), *((IFloat*)&X_in[n]+i*2), *((IFloat*)&X_in[n]+i*2+1)); #if 1 Fprintf(fp,"\n"); #else Fprintf(fp," (%0.2e %0.2e)\n", *((IFloat*)&X_out2[n]+i*2)-*((IFloat*)&X_in[n]+i*2), *((IFloat*)&X_out2[n]+i*2+1)-*((IFloat*)&X_in[n]+i* 2+1)); #endif } } } Fclose(fp); sfree(X_in); sfree(result); sfree(X_out); sfree(X_out2); return 0; }
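// ---------------------------------------------------------------------------
// Hedged note (not part of the original source): the k==0 branch of the test
// above applies the staggered operator back onto the inverter solution,
//     X_out2 = (2*mass + Dslash) * MatInv(X_in),
// and the commented-out printout suggests that X_out2 is meant to reproduce
// X_in up to the solver tolerance.  The helper below is an illustrative way
// to reduce that comparison to a single number.
// ---------------------------------------------------------------------------
static double max_abs_difference(const double *x, const double *y, int n_floats)
{
  double maxdiff = 0.0;
  for (int i = 0; i < n_floats; i++) {
    double d = x[i] - y[i];
    if (d < 0) d = -d;
    if (d > maxdiff) maxdiff = d;
  }
  return maxdiff;   // should be small relative to the source norm after convergence
}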
ForceArg Fp4::EvolveMomFforce(Matrix *mom, Vector *frm, Float mass, Float dt){ char *fname = "EvolveMomFforce(M*,V*,F,F,F)"; ERR.NotImplemented(cname,fname); ForceArg Fdt; #if 0 VRB.Func(cname,fname); #ifdef PROFILE Float dtime; ParTrans::PTflops=0; ForceFlops=0; #endif size_t size; // int nflops=0; static int vax_len = 0; if (vax_len == 0) vax_len = GJP.VolNodeSites()*VECT_LEN/VAXPY_UNROLL; size = GJP.VolNodeSites()/2*FsiteSize()*sizeof(Float); Vector *X = (Vector *)smalloc(2*size); // printf("X=%p\n",X); Vector *X_e = X; // even sites Vector *X_o = X+GJP.VolNodeSites()/2; // odd sites // The argument frm should have the CG solution. // The FstagTypes protected pointer f_tmp should contain Dslash frm moveMem(X_e, frm, size); #ifdef DEBUGGING f_tmp = frm+GJP.VolNodeSites()/2; // debugging only #endif moveMem(X_o, f_tmp, size); Fconvert(X, CANONICAL, STAG); Convert(STAG); // Puts staggered phases into gauge field. int N; // N can be 1, 2 or 4. N = 4; if (GJP.VolNodeSites()>256) N = 2; else if (GJP.VolNodeSites()>512) N = 1; VRB.Flow(cname,fname,"N=%d\n",N); enum{plus=0, minus=1, n_sign=2}; // Array in which to accumulate the force term: // this must be initialised to zero #if 0 Matrix **force = (Matrix**)amalloc(sizeof(Matrix), 2, 4, GJP.VolNodeSites()); if(!force) ERR.Pointer(cname, fname, "force"); #else size = GJP.VolNodeSites()*sizeof(Matrix); Matrix *force[4]; for(int i = 0;i<4;i++) force[i] = (Matrix *)v_alloc("force[i]",size); #endif for(int i=0; i<4; i++) for(int s=0; s<GJP.VolNodeSites(); s++) force[i][s].ZeroMatrix(); ParTransAsqtad parallel_transport(*this); // Vector arrays for which we must allocate memory #if 0 Vector ***Pnu = (Vector***)amalloc(sizeof(Vector), 3, n_sign, N, GJP.VolNodeSites()); if(!Pnu) ERR.Pointer(cname, fname, "Pnu"); Vector ****P3 = (Vector****)amalloc(sizeof(Vector), 4, n_sign, n_sign, N, GJP.VolNodeSites()); if(!P3) ERR.Pointer(cname, fname, "P3"); Vector ****Prhonu = (Vector****)amalloc(sizeof(Vector), 4, n_sign, n_sign, N, GJP.VolNodeSites()); if(!Prhonu) ERR.Pointer(cname, fname, "Prhonu"); Vector *****P5 = (Vector*****)amalloc(sizeof(Vector), 5, n_sign, n_sign, n_sign, N, GJP.VolNodeSites()); if(!P5) ERR.Pointer(cname, fname, "P5"); Vector ******P7 = (Vector******)amalloc(sizeof(Vector), 6, n_sign, n_sign, n_sign, n_sign, N, GJP.VolNodeSites()); if(!P7) ERR.Pointer(cname, fname, "P7"); Vector ******Psigma7 = (Vector******)amalloc(sizeof(Vector), 6, n_sign, n_sign, n_sign, n_sign, N, GJP.VolNodeSites()); if(!Psigma7) ERR.Pointer(cname, fname, "Psigma7"); // These vectors can be overlapped with previously allocated memory Vector **Pnununu = Prhonu[0][0]; Vector ***Pnunu = Psigma7[0][0][0];; Vector ****Pnu5 = P7[0][0]; Vector ****Pnu3 = P7[0][0]; Vector *****Prho5 = Psigma7[0]; Vector *****Psigmarhonu = Psigma7[0]; #else size = GJP.VolNodeSites()*sizeof(Vector); Vector *Pnu[n_sign][N]; Vector *P3[n_sign][n_sign][N]; Vector *Prhonu[n_sign][n_sign][N]; Vector *P5[n_sign][n_sign][n_sign][N]; Vector *P7[n_sign][n_sign][n_sign][n_sign][N]; Vector *Psigma7[n_sign][n_sign][n_sign][n_sign][N]; Vector *Pnununu[N]; Vector *Pnunu[n_sign][N]; Vector *Pnu5[n_sign][n_sign][N]; Vector *Pnu3[n_sign][n_sign][N]; Vector *Prho5[n_sign][n_sign][n_sign][N]; Vector *Psigmarhonu[n_sign][n_sign][n_sign][N]; //printf("Pnu=%p Psigmarhonu=%p\n",Pnu,Psigmarhonu); for(int w = 0;w<N;w++){ for(int i = 0;i<n_sign;i++){ Pnu[i][w]= (Vector *)v_alloc("Pnu",size); for(int j = 0;j<n_sign;j++){ P3[i][j][w]= (Vector *)v_alloc("P3",size); Prhonu[i][j][w]= (Vector *)v_alloc("Prhonu",size); 
for(int k = 0;k<n_sign;k++){ P5[i][j][k][w]= (Vector *)v_alloc("P5",size); for(int l = 0;l<n_sign;l++){ P7[i][j][k][l][w]= (Vector *)v_alloc("P7",size); Psigma7[i][j][k][l][w]= (Vector *)v_alloc("Psigma7",size); } Prho5[i][j][k][w] = Psigma7[0][i][j][k][w]; Psigmarhonu[i][j][k][w] = Psigma7[0][i][j][k][w]; } Pnu5[i][j][w]=P7[0][0][i][j][w]; Pnu3[i][j][w]=P7[0][0][i][j][w]; } Pnunu[i][w]=Psigma7[0][0][0][i][w]; } Pnununu[w]=Prhonu[0][0][w]; } #endif // input/output arrays for the parallel transport routines Vector *vin[n_sign*N], *vout[n_sign*N]; int dir[n_sign*N]; int mu[N], nu[N], rho[N], sigma[N]; // Sets of directions int w; // The direction index 0...N-1 int ms, ns, rs, ss; // Sign of direction bool done[4] = {false,false,false,false}; // Flags to tell us which // nu directions we have done. #ifdef PROFILE dtime = -dclock(); #endif for (int m=0; m<4; m+=N){ // Loop over mu for(w=0; w<N; w++) mu[w] = (m+w)%4; for (int n=m+1; n<m+4; n++){ // Loop over nu for(w=0; w<N; w++) nu[w] = (n+w)%4; // Pnu = U_nu X for(int i=0; i<N; i++){ vin[i] = vin[i+N] = X; dir[n_sign*i] = n_sign*nu[i]+plus; // nu_i dir[n_sign*i+1] = n_sign*nu[i]+minus; // -nu_i vout[n_sign*i] = Pnu[minus][i]; vout[n_sign*i+1] = Pnu[plus][i]; } parallel_transport.run(n_sign*N, vout, vin, dir); // P3 = U_mu Pnu // ms is the nu sign index, ms is the mu sign index, // w is the direction index for(int i=0; i<N; i++){ dir[n_sign*i] = n_sign*mu[i]+plus; // mu_i dir[n_sign*i+1] = n_sign*mu[i]+minus; // -mu_i } for(ns=0; ns<n_sign; ns++){ // ns is the sign of nu for(int i=0; i<N; i++){ vin[n_sign*i] = vin[n_sign*i+1] = Pnu[ns][i]; vout[n_sign*i] = P3[plus][ns][i]; vout[n_sign*i+1] = P3[minus][ns][i]; } parallel_transport.run(n_sign*N, vout, vin, dir); } for(w=0; w<N; w++) for(ns=0; ns<n_sign; ns++){ force_product_sum(P3[plus][ns][w], Pnu[ns][w], GJP.staple3_coeff(), force[mu[w]]); } for(int r=n+1; r<n+4; r++){ // Loop over rho bool nextr = false; for(w=0; w<N; w++){ rho[w] = (r+w)%4; if(rho[w]==mu[w]){ nextr = true; break; } } if(nextr) continue; for(w=0; w<N; w++){ // sigma for(int s=rho[w]+1; s<rho[w]+4; s++){ sigma[w] = s%4; if(sigma[w]!=mu[w] && sigma[w]!=nu[w]) break; } } // Prhonu = U_rho Pnu for(int i=0; i<N; i++){ dir[n_sign*i] = n_sign*rho[i]+plus; dir[n_sign*i+1] = n_sign*rho[i]+minus; } for(ns=0; ns<n_sign; ns++){ for(int i=0; i<N; i++){ vin[n_sign*i] = vin[n_sign*i+1] = Pnu[ns][i]; vout[n_sign*i] = Prhonu[ns][minus][i]; vout[n_sign*i+1] = Prhonu[ns][plus][i]; } parallel_transport.run(n_sign*N, vout, vin, dir); } // P5 = U_mu Prhonu for(int i=0; i<N; i++){ dir[n_sign*i] = n_sign*mu[i]+plus; dir[n_sign*i+1] = n_sign*mu[i]+minus; } for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) { for(int i=0; i<N; i++){ vin[n_sign*i] = vin[n_sign*i+1] = Prhonu[ns][rs][i]; vout[n_sign*i] = P5[plus][ns][rs][i]; vout[n_sign*i+1] = P5[minus][ns][rs][i]; } parallel_transport.run(n_sign*N, vout, vin, dir); } // F_mu += P5 Prhonu^dagger for(w=0; w<N; w++) for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) force_product_sum(P5[plus][ns][rs][w], Prhonu[ns][rs][w], GJP.staple5_coeff(), force[mu[w]]); // Psigmarhonu = U_sigma P_rhonu for(int i=0; i<N; i++){ dir[n_sign*i] = (n_sign*sigma[i]); dir[n_sign*i+1] = (n_sign*sigma[i]+1); } for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++){ for(int i=0; i<N; i++){ vin[n_sign*i] = vin[n_sign*i+1] = Prhonu[ns][rs][i]; vout[n_sign*i] = Psigmarhonu[ns][rs][minus][i]; vout[n_sign*i+1] = Psigmarhonu[ns][rs][plus][i]; } parallel_transport.run(n_sign*N, vout, vin, dir); } // P7 = U_mu P_sigmarhonu for(int 
i=0; i<N; i++){ dir[n_sign*i] = n_sign*mu[i]+plus; dir[n_sign*i+1] = n_sign*mu[i]+minus; } for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) for(ss=0; ss<n_sign; ss++){ for(int i=0; i<N; i++){ vin[n_sign*i] = vin[n_sign*i+1] = Psigmarhonu[ns][rs][ss][i]; vout[n_sign*i] = P7[plus][ns][rs][ss][i]; vout[n_sign*i+1] = P7[minus][ns][rs][ss][i]; } parallel_transport.run(n_sign*N, vout, vin, dir); } // F_mu -= P7 Psigmarhonu^\dagger for(w=0; w<N; w++) for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) for(ss=0; ss<n_sign; ss++) force_product_sum(P7[plus][ns][rs][ss][w], Psigmarhonu[ns][rs][ss][w], GJP.staple7_coeff(), force[mu[w]]); // F_sigma += P7 Psigmarhonu^\dagger // N.B. this is the same as one of the previous products. for(w=0; w<N; w++) for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) force_product_sum(P7[plus][ns][rs][minus][w], Psigmarhonu[ns][rs][minus][w], -GJP.staple7_coeff(), force[sigma[w]]); // F_sigma += Psigmarhonu P7^\dagger for(w=0; w<N; w++) for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) force_product_sum(Psigmarhonu[ns][rs][minus][w], P7[minus][ns][rs][minus][w], -GJP.staple7_coeff(), force[sigma[w]]); // Psigma7 = U_sigma P7 for(int i=0; i<N; i++){ dir[n_sign*i] = (n_sign*sigma[i]); dir[n_sign*i+1] = (n_sign*sigma[i]+1); } for(ms=0; ms<n_sign; ms++) for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++){ for(int i=0; i<N; i++){ vin[n_sign*i] = P7[ms][ns][rs][plus][i]; vin[n_sign*i+1] = P7[ms][ns][rs][minus][i]; vout[n_sign*i] = Psigma7[ms][ns][rs][plus][i]; vout[n_sign*i+1] = Psigma7[ms][ns][rs][minus][i]; } parallel_transport.run(n_sign*N, vout, vin, dir); } // F_sigma += Fsigma7 Frhonu^\dagger for(w=0; w<N; w++) for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) force_product_sum(Psigma7[plus][ns][rs][plus][w], Prhonu[ns][rs][w], -GJP.staple7_coeff(), force[sigma[w]]); // F_sigma += Frhonu Fsigma7^\dagger for(w=0; w<N; w++) for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) force_product_sum(Prhonu[ns][rs][w], Psigma7[minus][ns][rs][plus][w], -GJP.staple7_coeff(), force[sigma[w]]); // P5 += c_7/c_5 Psigma7 if(GJP.staple5_coeff()!=0.0){ Float c75 = -GJP.staple7_coeff()/GJP.staple5_coeff(); for(ms=0; ms<n_sign; ms++) for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) for(ss=0; ss<n_sign; ss++) for(w=0; w<N; w++) vaxpy3(P5[ms][ns][rs][w],&c75, Psigma7[ms][ns][rs][ss][w], P5[ms][ns][rs][w], vax_len); // P5[ms][ns][rs][w]->FTimesV1PlusV2(-GJP.staple7_coeff()/GJP.staple5_coeff(), Psigma7[ms][ns][rs][ss][w], P5[ms][ns][rs][w], GJP.VolNodeSites()*VECT_LEN); ForceFlops += 2*GJP.VolNodeSites()*VECT_LEN*N*n_sign*n_sign*n_sign*n_sign; } // F_rho -= P5 Prhonu^\dagger for(w=0; w<N; w++) for(ns=0; ns<n_sign; ns++) force_product_sum(P5[plus][ns][minus][w], Prhonu[ns][minus][w], -GJP.staple5_coeff(), force[rho[w]]); // F_rho -= Prhonu P5^\dagger for(w=0; w<N; w++) for(ns=0; ns<n_sign; ns++) force_product_sum(Prhonu[ns][minus][w], P5[minus][ns][minus][w], -GJP.staple5_coeff(), force[rho[w]]); // Prho5 = U_rho P5 for(int i=0; i<N; i++){ dir[n_sign*i] = n_sign*rho[i]+plus; dir[n_sign*i+1] = n_sign*rho[i]+minus; } for(ms=0; ms<n_sign; ms++) for(ns=0; ns<n_sign; ns++){ for(int i=0; i<N; i++){ vin[n_sign*i] = P5[ms][ns][plus][i]; vin[n_sign*i+1] = P5[ms][ns][minus][i]; vout[n_sign*i] = Prho5[ms][ns][plus][i]; vout[n_sign*i+1] = Prho5[ms][ns][minus][i]; } parallel_transport.run(n_sign*N, vout, vin, dir); } // F_rho -= Prho5 Pnu^\dagger for(w=0; w<N; w++) for(ns=0; ns<n_sign; ns++) force_product_sum(Prho5[plus][ns][plus][w], Pnu[ns][w], -GJP.staple5_coeff(), 
force[rho[w]]); // F_rho -= Pnu Prho5^\dagger for(w=0; w<N; w++) for(ns=0; ns<n_sign; ns++) force_product_sum(Pnu[ns][w], Prho5[minus][ns][plus][w], -GJP.staple5_coeff(), force[rho[w]]); // P3 += c_5/c_3 Prho5 if(GJP.staple3_coeff()!=0.0){ Float c53 = -GJP.staple5_coeff()/GJP.staple3_coeff(); for(ms=0; ms<n_sign; ms++) for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) for(w=0; w<N; w++) vaxpy3(P3[ms][ns][w],&c53,Prho5[ms][ns][rs][w], P3[ms][ns][w], vax_len); // P3[ms][ns][w]->FTimesV1PlusV2(-GJP.staple5_coeff()/GJP.staple3_coeff(), Prho5[ms][ns][rs][w], P3[ms][ns][w], GJP.VolNodeSites()*VECT_LEN); ForceFlops += 2*GJP.VolNodeSites()*VECT_LEN*N*n_sign*n_sign*n_sign; } } // rho+sigma loop // Pnunu = U_nu Pnu for(int i=0; i<N; i++){ dir[n_sign*i] = n_sign*nu[i]+plus; dir[n_sign*i+1] = n_sign*nu[i]+minus; } for(int i=0; i<N; i++){ vin[n_sign*i] = Pnu[minus][i]; vin[n_sign*i+1] = Pnu[plus][i]; vout[n_sign*i] = Pnunu[minus][i]; vout[n_sign*i+1] = Pnunu[plus][i]; } parallel_transport.run(n_sign*N, vout, vin, dir); // P5 = U_mu Pnunu for(int i=0; i<N; i++){ dir[n_sign*i] = n_sign*mu[i]+plus; dir[n_sign*i+1] = n_sign*mu[i]+minus; } for(ns=0; ns<n_sign; ns++){ for(int i=0; i<N; i++){ vin[n_sign*i] = Pnunu[ns][i]; vin[n_sign*i+1] = Pnunu[ns][i]; vout[n_sign*i] = P5[plus][ns][0][i]; vout[n_sign*i+1] = P5[minus][ns][0][i]; } parallel_transport.run(n_sign*N, vout, vin, dir); } // F_mu += P5 Pnunu^\dagger for(w=0; w<N; w++) for(ns=0; ns<n_sign; ns++) force_product_sum(P5[plus][ns][0][w], Pnunu[ns][w], GJP.Lepage_coeff(), force[mu[w]]); // F_nu -= P5 Pnunu^\dagger // N.B. this is the same as one of the previous products for(w=0; w<N; w++) force_product_sum(P5[plus][minus][0][w], Pnunu[minus][w], -GJP.Lepage_coeff(), force[nu[w]]); // F_nu -= Pnunu P5^\dagger for(w=0; w<N; w++) force_product_sum(Pnunu[minus][w], P5[minus][minus][0][w], -GJP.Lepage_coeff(), force[nu[w]]); // Pnu5 = U_nu P5 for(int i=0; i<N; i++){ dir[n_sign*i] = n_sign*nu[i]+plus; dir[n_sign*i+1] = n_sign*nu[i]+minus; } for(ms=0; ms<n_sign; ms++){ for(int i=0; i<N; i++){ vin[n_sign*i] = P5[ms][plus][0][i]; vin[n_sign*i+1] = P5[ms][minus][0][i]; vout[n_sign*i] = Pnu5[ms][plus][i]; vout[n_sign*i+1] = Pnu5[ms][minus][i]; } parallel_transport.run(n_sign*N, vout, vin, dir); } // F_nu -= Pnu5 Pnu^\dagger for(w=0; w<N; w++) force_product_sum(Pnu5[plus][plus][w], Pnu[plus][w], -GJP.Lepage_coeff(), force[nu[w]]); // F_nu -= Pnu Pnu5^\dagger for(w=0; w<N; w++) force_product_sum(Pnu[plus][w], Pnu5[minus][plus][w], -GJP.Lepage_coeff(), force[nu[w]]); // P3 += c_L/c_3 Pnu5 if(GJP.staple3_coeff()!=0.0){ Float cl3 = -GJP.Lepage_coeff()/GJP.staple3_coeff(); for(ms=0; ms<n_sign; ms++) for(ns=0; ns<n_sign; ns++) for(w=0; w<N; w++) vaxpy3(P3[ms][ns][w],&cl3,Pnu5[ms][ns][w],P3[ms][ns][w], vax_len); // P3[ms][ns][w]->FTimesV1PlusV2(-GJP.Lepage_coeff()/GJP.staple3_coeff(), Pnu5[ms][ns][w], P3[ms][ns][w], GJP.VolNodeSites()*VECT_LEN); ForceFlops += 2*GJP.VolNodeSites()*VECT_LEN*N*n_sign*n_sign; } // F_nu += P3 Pnu^\dagger for(w=0; w<N; w++) force_product_sum(P3[plus][minus][w], Pnu[minus][w], -GJP.staple3_coeff(), force[nu[w]]); // F_nu += Pnu P3^\dagger for(w=0; w<N; w++) force_product_sum(Pnu[minus][w], P3[minus][minus][w], -GJP.staple3_coeff(), force[nu[w]]); // Pnu3 = U_nu P3 for(int i=0; i<N; i++) dir[i] = n_sign*nu[i]+plus; for(ms=0; ms<n_sign; ms++){ for(int i=0; i<N; i++){ vin[i] = P3[ms][plus][i]; vout[i] = Pnu3[ms][plus][i]; } parallel_transport.run(N, vout, vin, dir); } // F_nu += Pnu3 X^\dagger for(w=0; w<N; w++) 
force_product_sum(Pnu3[plus][plus][w], X, -GJP.staple3_coeff(), force[nu[w]]); // F_nu += X Pnu3^\dagger for(w=0; w<N; w++) force_product_sum(X, Pnu3[minus][plus][w], -GJP.staple3_coeff(), force[nu[w]]); // This stuff is to be done once only for each value of nu[w]. // Look for N nu's that haven't been done before. bool nextn = false; for(w=0; w<N; w++) if(done[nu[w]]){ nextn = true; break; } if(nextn) continue; for(w=0; w<N; w++) done[nu[w]] = true; // Got N new nu's, so do some stuff... // F_nu += Pnu X^\dagger for(w=0; w<N; w++) force_product_sum(Pnu[minus][w], X, GJP.KS_coeff(), force[nu[w]]); // F_nu += Pnunu Pnu^\dagger for(w=0; w<N; w++) force_product_sum(Pnunu[minus][w], Pnu[plus][w], -GJP.Naik_coeff(), force[nu[w]]); // F_nu += Pnu Pnunu^\dagger for(w=0; w<N; w++) force_product_sum(Pnu[minus][w], Pnunu[plus][w], GJP.Naik_coeff(), force[nu[w]]); // Pnununu = U_nu Pnunu for(int i=0; i<N; i++){ dir[i] = n_sign*nu[i]+plus; vin[i] = Pnunu[minus][i]; vout[i] = Pnununu[i]; } parallel_transport.run(N, vout, vin, dir); // F_nu += Pnununu X^\dagger for(w=0; w<N; w++) force_product_sum(Pnununu[w], X, GJP.Naik_coeff(), force[nu[w]]); } // nu loop } // mu loop // Now that we have computed the force, we can update the momenta // nflops +=ParTrans::PTflops + ForceFlops; #ifdef PROFILE dtime += dclock(); int nflops = ParTrans::PTflops + ForceFlops; printf("%s:%s:",cname,fname); print_flops(nflops,dtime); #endif Fdt = update_momenta(force, dt, mom); // Tidy up #if 0 sfree(Pnu); sfree(P3); sfree(Prhonu); sfree(P5); sfree(P7); sfree(Psigma7); #else for(int w = 0;w<N;w++){ for(int i = 0;i<n_sign;i++){ v_free(Pnu[i][w]); for(int j = 0;j<n_sign;j++){ v_free(P3[i][j][w]); v_free(Prhonu[i][j][w]); for(int k = 0;k<n_sign;k++){ v_free(P5[i][j][k][w]); for(int l = 0;l<n_sign;l++){ v_free(P7[i][j][k][l][w]); v_free(Psigma7[i][j][k][l][w]); } } } } } #endif for(int i = 0;i<4;i++) v_free(force[i]); sfree(X); Convert(CANONICAL); #endif return Fdt; }
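// ---------------------------------------------------------------------------
// Hedged sketch (not part of the original source) of the coefficient-folding
// step used in the (disabled) Fp4 force above: folding Psigma7 into P5 with
// weight -c7/c5, and later Prho5/Pnu5 into P3 with -c5/c3 and -cL/c3, lets
// every subsequent term built from P5 or P3 carry the longer-staple
// contributions without extra force_product_sum calls.  Reading off the
// commented-out FTimesV1PlusV2 alternative, the scalar equivalent of the
// vaxpy3()/vaxpy3_m() kernels, "y = a*x + y" over len Floats, is:
// ---------------------------------------------------------------------------
static void axpy_sketch(double *y, double a, const double *x, int len)
{
  for (int i = 0; i < len; i++)
    y[i] = a * x[i] + y[i];
}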
ForceArg Gwilson::EvolveMomGforce(Matrix *mom, Float dt){ char *fname = "EvolveMomGforce(M*,F)"; VRB.Func(cname,fname); static Matrix mt0; static Matrix *mp0 = &mt0; Float L1=0.0; Float L2=0.0; Float Linf=0.0; #ifdef PROFILE Float time = -dclock(); ForceFlops=0; ParTrans::PTflops=0; #endif static int vol = GJP.VolNodeSites(); const int N = 4; Float tmp = GJP.Beta() *invs3; Matrix *Unit = (Matrix *) fmalloc(vol*sizeof(Matrix)); Matrix *tmp1[N]; Matrix *tmp2[N]; Matrix *result[4]; for(int i = 0;i<4;i++){ result[i] = (Matrix *) fmalloc(vol*sizeof(Matrix)); } for(int i = 0;i<N;i++){ tmp1[i] = (Matrix *) fmalloc(vol*sizeof(Matrix)); tmp2[i] = (Matrix *) fmalloc(vol*sizeof(Matrix)); bzero((char *)tmp2[i],vol*sizeof(Matrix)); } for(int i = 0;i<vol;i++) Unit[i]=1.; Matrix *Units[4]; for(int i = 0;i<N;i++) Units[i] = Unit; int mu,nu; { int dirs_p[] = {0,2,4,6,0,2,4}; int dirs_m[] = {1,3,5,7,1,3,5}; ParTransGauge pt(*this); for(nu = 1;nu<4;nu++){ pt.run(N,tmp1,Units,dirs_m+nu); pt.run(N,result,tmp1,dirs_m); pt.run(N,tmp1,result,dirs_p+nu); for(int i = 0; i<N;i++) vaxpy3_m(tmp2[i],&tmp,tmp1[i],tmp2[i],vol*3); pt.run(N,tmp1,Units,dirs_p+nu); pt.run(N,result,tmp1,dirs_m); pt.run(N,tmp1,result,dirs_m+nu); for(int i = 0; i<N;i++) vaxpy3_m(tmp2[i],&tmp,tmp1[i],tmp2[i],vol*3); ForceFlops +=vol*12*N; } pt.run(N,result,tmp2,dirs_p); } Matrix mp1; for(mu = 0;mu<4;mu++){ Matrix *mtmp = result[mu]; for(int i = 0;i<vol;i++) { mtmp->TrLessAntiHermMatrix(); mtmp++; } } ForceFlops += vol*60; int x[4]; for(x[0] = 0; x[0] < GJP.XnodeSites(); ++x[0]) { for(x[1] = 0; x[1] < GJP.YnodeSites(); ++x[1]) { for(x[2] = 0; x[2] < GJP.ZnodeSites(); ++x[2]) { for(x[3] = 0; x[3] < GJP.TnodeSites(); ++x[3]) { int uoff = GsiteOffset(x); for (int mu = 0; mu < 4; ++mu) { IFloat *ihp = (IFloat *)(mom+uoff+mu); IFloat *dotp2 = (IFloat *) (result[mu]+(uoff/4)); fTimesV1PlusV2(ihp, dt, dotp2, ihp, 18); Float norm = ((Matrix*)dotp2)->norm(); Float tmp = sqrt(norm); L1 += tmp; L2 += norm; Linf = (tmp>Linf ? tmp : Linf); } } } } } ForceFlops += vol*144; #ifdef PROFILE time += dclock(); print_flops(cname,fname,ForceFlops+ParTrans::PTflops,time); #endif ffree(Unit); for(int i = 0;i<N;i++){ ffree(tmp1[i]); ffree(tmp2[i]); } for(int i = 0;i<4;i++) ffree(result[i]); glb_sum(&L1); glb_sum(&L2); glb_max(&Linf); L1 /= 4.0*GJP.VolSites(); L2 /= 4.0*GJP.VolSites(); return ForceArg(dt*L1, dt*sqrt(L2), dt*Linf); }
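//------------------------------------------------------------------
// Illustrative sketch (not part of the build): before the momentum update
// above, each staple sum result[mu] is projected onto the su(3) algebra with
// TrLessAntiHermMatrix(). Assuming that routine replaces M in place by its
// traceless anti-Hermitian part, A = (M - M^dag)/2 - (Tr[(M - M^dag)/2]/3) I,
// a standalone version could look like the following (names hypothetical).
//------------------------------------------------------------------
#if 0
#include <complex>

// M is a 3x3 complex matrix; replace it by its traceless anti-Hermitian part.
static void traceless_antiherm(std::complex<double> M[3][3])
{
  std::complex<double> A[3][3];
  for (int i = 0; i < 3; ++i)
    for (int j = 0; j < 3; ++j)
      A[i][j] = 0.5 * (M[i][j] - std::conj(M[j][i]));   // anti-Hermitian part
  std::complex<double> tr3 = (A[0][0] + A[1][1] + A[2][2]) / 3.0;
  for (int i = 0; i < 3; ++i)
    A[i][i] -= tr3;                                     // remove the trace
  for (int i = 0; i < 3; ++i)
    for (int j = 0; j < 3; ++j)
      M[i][j] = A[i][j];
}
#endif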
int DiracOpWilson::QudaInvert(Vector *out, Vector *in, Float *true_res, int mat_type) { char *fname = "QudaInvert(V*, V*, F*, int)"; VRB.ActivateLevel(VERBOSE_FLOW_LEVEL); struct timeval start, end; gettimeofday(&start,NULL); QudaGaugeParam gauge_param = newQudaGaugeParam(); QudaInvertParam inv_param = newQudaInvertParam(); int f_size_cb = GJP.VolNodeSites() * lat.FsiteSize() / 2; //-------------------------------------- // Parameter setting for Gauge Data //-------------------------------------- // set the CUDA precisions gauge_param.reconstruct = setReconstruct_wil(QudaParam.reconstruct); gauge_param.cuda_prec = setPrecision_wil(QudaParam.gauge_prec); // set the CUDA sloppy precisions gauge_param.reconstruct_sloppy = setReconstruct_wil(QudaParam.reconstruct_sloppy); gauge_param.cuda_prec_sloppy = setPrecision_wil(QudaParam.gauge_prec_sloppy); if (sizeof(Float) == sizeof(double)) { gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION; inv_param.cpu_prec = QUDA_DOUBLE_PRECISION; } else { gauge_param.cpu_prec = QUDA_SINGLE_PRECISION; inv_param.cpu_prec = QUDA_SINGLE_PRECISION; } gauge_param.X[0] = GJP.XnodeSites(); gauge_param.X[1] = GJP.YnodeSites(); gauge_param.X[2] = GJP.ZnodeSites(); gauge_param.X[3] = GJP.TnodeSites(); gauge_param.anisotropy = GJP.XiBare(); gauge_param.cuda_prec_precondition = QUDA_DOUBLE_PRECISION; gauge_param.reconstruct_precondition = setReconstruct_wil(QudaParam.reconstruct_sloppy); if (GJP.XiDir() != 3) ERR.General(cname, fname, "Anisotropy direction not supported\n"); //--------------------------------------------------- // QUDA_FLOAT_GAUGE_ORDER = 1 // QUDA_FLOAT2_GAUGE_ORDER = 2, // no reconstruct and double precision // QUDA_FLOAT4_GAUGE_ORDER = 4, // 8 and 12 reconstruct half and single // QUDA_QDP_GAUGE_ORDER, // expect *gauge[4], even-odd, row-column color // QUDA_CPS_WILSON_GAUGE_ORDER, // expect *gauge, even-odd, mu inside, column-row color // QUDA_MILC_GAUGE_ORDER, // expect *gauge, even-odd, mu inside, row-column order // // MULTI GPU case, we have to use QDP format of gauge data // //--------------------------------------------------- gauge_param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER; //--------------------------------------------------- gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO; gauge_param.type = QUDA_WILSON_LINKS; for (int d=0; d<3; d++) if (GJP.Bc(d) != BND_CND_PRD) ERR.General(cname, fname, "Boundary condition not supported\n"); if (GJP.Tbc() == BND_CND_PRD) gauge_param.t_boundary = QUDA_PERIODIC_T; else gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T; //------------------------------------------ // Parameter setting for Matrix inversion //------------------------------------------ inv_param.cuda_prec = setPrecision_wil(QudaParam.spinor_prec); inv_param.cuda_prec_sloppy = setPrecision_wil(QudaParam.spinor_prec_sloppy); inv_param.maxiter = dirac_arg->max_num_iter; inv_param.reliable_delta = QudaParam.reliable_delta; inv_param.Ls = 1; //inv_param.Ls = GJP.SnodeSites(); //-------------------------- // Possible dslash type //-------------------------- // QUDA_WILSON_DSLASH // QUDA_CLOVER_WILSON_DSLASH // QUDA_DOMAIN_WALL_DSLASH // QUDA_ASQTAD_DSLASH // QUDA_TWISTED_MASS_DSLASH //-------------------------- inv_param.dslash_type = QUDA_WILSON_DSLASH; //-------------------------------- // Possible normalization method //-------------------------------- // QUDA_KAPPA_NORMALIZATION // QUDA_MASS_NORMALIZATION // QUDA_ASYMMETRIC_MASS_NORMALIZATION //-------------------------------- inv_param.kappa = kappa; inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
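  // Note: with kappa normalization the CPS hopping parameter is passed to
  // QUDA unchanged. For Wilson fermions in four dimensions (Wilson
  // parameter r = 1) it is related to the bare quark mass m0 by
  //   kappa = 1 / (2*m0 + 8),
  // e.g. m0 = 0 gives kappa = 0.125. The commented-out lines below would
  // instead hand QUDA the bare mass and use mass normalization.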
//inv_param.mass = dirac_arg->mass; //inv_param.mass_normalization = QUDA_MASS_NORMALIZATION; inv_param.dagger = QUDA_DAG_NO; switch (mat_type) { case 0: inv_param.solution_type = QUDA_MATPC_SOLUTION; break; case 1: inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION; break; default: ERR.General(cname, fname, "Matrix solution type not defined\n"); } inv_param.matpc_type = QUDA_MATPC_ODD_ODD; //inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO; inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; //inv_param.gamma_basis = QUDA_UKQCD_GAMMA_BASIS; inv_param.dirac_order = QUDA_CPS_WILSON_DIRAC_ORDER; //inv_param.dirac_order = QUDA_DIRAC_ORDER; inv_param.input_location = QUDA_CPU_FIELD_LOCATION; inv_param.output_location = QUDA_CPU_FIELD_LOCATION; inv_param.tune = QUDA_TUNE_NO; inv_param.use_init_guess = QUDA_USE_INIT_GUESS_YES; //-------------------------- // Possible verbose type //-------------------------- // QUDA_SILENT // QUDA_SUMMARIZE // QUDA_VERBOSE // QUDA_DEBUG_VERBOSE //-------------------------- inv_param.verbosity = QUDA_VERBOSE; switch (dirac_arg->Inverter) { case CG: inv_param.inv_type = QUDA_CG_INVERTER; inv_param.solve_type = QUDA_NORMEQ_PC_SOLVE; break; case BICGSTAB: inv_param.inv_type = QUDA_BICGSTAB_INVERTER; inv_param.solve_type = QUDA_DIRECT_PC_SOLVE; break; default: inv_param.inv_type = QUDA_CG_INVERTER; inv_param.solve_type = QUDA_NORMEQ_PC_SOLVE; break; } // domain decomposition preconditioner parameters inv_param.inv_type_precondition = QUDA_INVALID_INVERTER; inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ; inv_param.precondition_cycle = 1; inv_param.tol_precondition = 1e-1; inv_param.maxiter_precondition = 10; inv_param.verbosity_precondition = QUDA_VERBOSE; inv_param.prec_precondition = QUDA_HALF_PRECISION; inv_param.omega = 1.0; gauge_param.ga_pad = 0; // 24*24*24/2; inv_param.sp_pad = 0; // 24*24*24/2; inv_param.cl_pad = 0; // 24*24*24/2; #ifdef USE_QMP //------------------------------------------ // This part is needed to make buffer memory // space for multi GPU Comm. 
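// The ghost-zone pad handed to QUDA below is taken to be the largest half
// (checkerboarded) face of the local volume,
//   pad = max( X[1]X[2]X[3], X[0]X[2]X[3], X[0]X[1]X[3], X[0]X[1]X[2] ) / 2 ;
// e.g. a 16^3 x 32 local lattice gives 16*16*32/2 = 4096 sites.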
//------------------------------------------ int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2; int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2; int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2; int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2; int pad_size =MAX(x_face_size, y_face_size); pad_size = MAX(pad_size, z_face_size); pad_size = MAX(pad_size, t_face_size); gauge_param.ga_pad = pad_size; #endif loadGaugeQuda((void*)gauge_field, &gauge_param); Vector *x = (Vector*)smalloc(f_size_cb * sizeof(Float)); Vector *r = (Vector*)smalloc(f_size_cb * sizeof(Float)); x->VecZero(f_size_cb); r->VecZero(f_size_cb); //---------------------------------------------- // Calculate Flops value //---------------------------------------------- Float flops = 0.0; Float matvec_flops = (2*1320+48)*GJP.VolNodeSites()/2; if (mat_type == 1) matvec_flops *= 2; // double flops since normal equations //---------------------------------------------- //---------------------------------------------- // Calculate Stop condition //---------------------------------------------- Float in_norm2 = in->NormSqGlbSum(f_size_cb); Float stop = dirac_arg->stop_rsd * dirac_arg->stop_rsd * in_norm2; int total_iter = 0, k = 0; // Initial residual if (mat_type == 0) { MatPc(r,out); } else { MatPcDagMatPc(r,out); } r->FTimesV1MinusV2(1.0,in,r,f_size_cb); Float r2 = r->NormSqGlbSum(f_size_cb); flops += 4*f_size_cb + matvec_flops; VRB.Flow(cname, fname, "0 iterations, res^2 = %1.15e, restart = 0\n", r2); while (r2 > stop && k < QudaParam.max_restart) { inv_param.tol = dirac_arg->stop_rsd; if(sqrt(stop/r2)>inv_param.tol) { inv_param.tol = sqrt(stop/r2); } x->VecZero(f_size_cb); //--------------------------------- // Inversion sequence start //--------------------------------- invertQuda(x, r, &inv_param); // Update solution out->VecAddEquVec(x, f_size_cb); //------------------------------------ // Calculate new residual if (mat_type == 0) MatPc(r, out); else MatPcDagMatPc(r, out); r->FTimesV1MinusV2(1.0,in,r,f_size_cb); r2 = r->NormSqGlbSum(f_size_cb); //------------------------------------ k++; total_iter += inv_param.iter + 1; flops += 1e9*inv_param.gflops + 8*f_size_cb + matvec_flops; VRB.Flow(cname, fname, "Gflops = %e, Seconds = %e, Gflops/s = %f\n", inv_param.gflops, inv_param.secs, inv_param.gflops / inv_param.secs); VRB.Flow(cname, fname, "True |res| / |src| = %1.15e, iter = %d, restart = %d\n", sqrt(r2)/sqrt(in_norm2), total_iter, k); } gettimeofday(&end,NULL); print_flops(cname,fname,flops,&start,&end); VRB.Flow(cname, fname, "Cuda Space Required. Spinor:%f + Gauge:%f GiB\n", inv_param.spinorGiB, gauge_param.gaugeGiB); VRB.Flow(cname, fname, "True |res| / |src| = %1.15e, iter = %d, restart = %d\n", sqrt(r2)/sqrt(in_norm2), total_iter, k); if (true_res) *true_res = sqrt(r2); //---------------------------------------- // Finalize QUDA memory and API //---------------------------------------- freeGaugeQuda(); //---------------------------------------- sfree(x); sfree(r); //VRB.DeactivateLevel(VERBOSE_FLOW_LEVEL); return total_iter; }
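//------------------------------------------------------------------
// Illustrative sketch (not part of the build) of the restart pattern used in
// QudaInvert above: the inner GPU solver is only asked for a relative
// tolerance, and the outer loop recomputes the true residual r = b - A*x on
// the host until |r|^2 <= stop or max_restart is reached. apply_A and
// inner_solve stand in for MatPc/MatPcDagMatPc and invertQuda; all names
// below are hypothetical.
//------------------------------------------------------------------
#if 0
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <functional>
#include <vector>

typedef std::vector<double> Vec;

static double norm2(const Vec &v)
{
  double s = 0.0;
  for (std::size_t i = 0; i < v.size(); ++i) s += v[i] * v[i];
  return s;
}

// Returns the number of restarts taken; x holds the accumulated solution.
static int restarted_solve(Vec &x, const Vec &b,
                           const std::function<void(Vec &, const Vec &)> &apply_A,
                           const std::function<void(Vec &, const Vec &, double)> &inner_solve,
                           double stop_rsd, int max_restart)
{
  const double stop = stop_rsd * stop_rsd * norm2(b);   // target on |r|^2
  Vec r(b.size()), Ax(b.size()), dx(b.size());

  apply_A(Ax, x);                                       // true residual of the initial guess
  for (std::size_t i = 0; i < b.size(); ++i) r[i] = b[i] - Ax[i];
  double r2 = norm2(r);

  int restarts = 0;
  while (r2 > stop && restarts < max_restart) {
    double tol = stop_rsd;
    if (std::sqrt(stop / r2) > tol) tol = std::sqrt(stop / r2);  // don't over-solve the correction

    std::fill(dx.begin(), dx.end(), 0.0);
    inner_solve(dx, r, tol);                            // A*dx ~= r to relative tolerance tol
    for (std::size_t i = 0; i < b.size(); ++i) x[i] += dx[i];

    apply_A(Ax, x);                                     // recompute the true residual
    for (std::size_t i = 0; i < b.size(); ++i) r[i] = b[i] - Ax[i];
    r2 = norm2(r);
    ++restarts;
    std::printf("restart %d: |r|^2 = %e\n", restarts, r2);
  }
  return restarts;
}
#endif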
void PT::vec_cb_norm(int n, IFloat **vout, IFloat **vin, const int *dir,int parity, IFloat * gauge) { //List of the different directions int wire[n]; int i; // int j,d,s,k; //SCUDirArgs for sending and receiving in the n directions SCUDirArgIR *SCUarg_p[2*non_local_dirs]; //SCUDirArgIR *SCUarg_p[2*n]; SCUDirArgMulti SCUmulti; static int call_num = 0; int vlen = VECT_LEN; int vlen2 = VECT_LEN; // printf("gauge=%p\n",gauge); call_num++; //Name our function // char *fname="pt_1vec_cb_norm()"; //Set the transfer directions //If wire[i] is even, then we have communication in the negative direction //If wire[i] is odd, then we have communication in the positive direction for(i=0;i<n;i++) wire[i]=dir[i]; Float dtime; //If wire[i] is odd, then we have parallel transport in the //positive direction. In this case, the matrix multiplication is //done before the field is transferred over to the adjacent node // //If we have transfer in the negative T direction (wire[i] = 6) then //we have to copy the appropriate fields into the send buffer if(conjugated) for(i=0;i<n;i++) { if(!local[wire[i]/2]) { if(wire[i]%2) { //printf("dir = %d, pre-mulitply\n", wire[i]); #ifdef PROFILE dtime = - dclock(); #endif partrans_cmv(non_local_chi_cb[wire[i]]/2,uc_nl_cb_pre[parity][wire[i]/2],gauge,vin[i],snd_buf_cb[wire[i]/2]); #ifdef PROFILE dtime +=dclock(); print_flops("",fname,66*non_local_chi_cb[wire[i]],dtime); #endif } else if((wire[i] == 6)) { #ifdef PROFILE dtime = - dclock(); #endif #if 1 pt_copy_buffer(non_local_chi_cb[6],(long)vin[i],(long)snd_buf_t_cb,(long)Toffset[parity]); #else for(j = 0; j < non_local_chi_cb[6];j++) for(k = 0; k < VECT_LEN;k++) *(snd_buf_t_cb+j*VECT_LEN+k) = *(vin[i] + *(Toffset[parity]+j)+ k); //moveMem(snd_buf_t_cb + j*VECT_LEN,vin[i] + *(Toffset[parity]+j)*vlen,VECT_LEN*sizeof(IFloat)); #endif #ifdef PROFILE dtime +=dclock(); print_flops(fname,"pt_copy_buffer()",0,dtime); #endif } } } else for(i=0;i<n;i++) { if(!local[wire[i]/2]) { if(wire[i]%2) { #ifdef PROFILE dtime = - dclock(); #endif partrans_cmv_dag(non_local_chi_cb[wire[i]]/2,uc_nl_cb_pre[parity][wire[i]/2],gauge,vin[i],snd_buf_cb[wire[i]/2]); #ifdef PROFILE dtime +=dclock(); print_flops("",fname,66*non_local_chi_cb[wire[i]],dtime); #endif } else if(wire[i] == 6) { #ifdef PROFILE dtime = - dclock(); #endif #if 1 pt_copy_buffer(non_local_chi_cb[6],(long)vin[i],(long)snd_buf_t_cb,(long)Toffset[parity]); #else for(j = 0; j < non_local_chi_cb[6];j++) for(k = 0; k < VECT_LEN;k++) *(snd_buf_t_cb+j*VECT_LEN+k) = *(vin[i] + *(Toffset[parity]+j)+ k); // moveMem(snd_buf_t_cb + j*VECT_LEN,vin[i] + *(Toffset[parity]+j),VECT_LEN*sizeof(IFloat)); #endif #ifdef PROFILE dtime +=dclock(); print_flops(fname,"pt_copy_buffer()",0,dtime); #endif } } } #ifdef PROFILE dtime = - dclock(); #endif #if 1 int comms = 0; for(i=0;i<n;i++) { if(!local[wire[i]/2]) { //Calculate the starting address for the data to be sent IFloat *addr = vin[i] + VECT_LEN * offset_cb[wire[i]]; //This points to the appropriate SCUDirArg for receiving SCUarg_p[2*comms] = SCUarg_cb[2*wire[i]]; //This points to the appropriate SCUDirArg for sending SCUarg_p[2*comms+1] = SCUarg_cb[2*wire[i]+1]; //Set the send address if(wire[i]%2) SCUarg_p[2*comms+1]->Addr((void *)snd_buf_cb[wire[i]/2]); else if(wire[i] == 6) SCUarg_p[2*comms+1]->Addr((void *)snd_buf_t_cb); else SCUarg_p[2*comms+1]->Addr((void *)addr); comms++; } } #endif if(comms){ SCUmulti.Init(SCUarg_p,2*comms); //Begin transmission SCUmulti.SlowStartTrans(); } //Do local calculations if(conjugated) { for(i=0;i<n;i++) { #ifdef 
PROFILE dtime = - dclock(); #endif if(wire[i]%2) partrans_cmv(local_chi_cb[wire[i]]/2,uc_l_cb[parity][wire[i]],gauge,vin[i],vout[i]); else partrans_cmv_dag(local_chi_cb[wire[i]]/2,uc_l_cb[parity][wire[i]],gauge,vin[i],vout[i]); #ifdef PROFILE dtime +=dclock(); print_flops("",fname,66*local_chi_cb[wire[i]],dtime); #endif } } else { for(i=0;i<n;i++) { #ifdef PROFILE dtime = - dclock(); #endif if(!(wire[i]%2)) partrans_cmv(local_chi_cb[wire[i]]/2,uc_l_cb[parity][wire[i]],gauge,vin[i],vout[i]); else partrans_cmv_dag(local_chi_cb[wire[i]]/2,uc_l_cb[parity][wire[i]],gauge,vin[i],vout[i]); #ifdef PROFILE dtime +=dclock(); print_flops("",fname,66*local_chi_cb[wire[i]],dtime); #endif } } //End transmission if(comms){ SCUmulti.TransComplete(); } //If wire[i] is even, then we have transport in the negative direction. //In this case, the vector field is multiplied by the SU(3) link matrix //after all communication is complete IFloat *fp0, *fp1; for(i=0;i<n;i++) { if(!local[wire[i]/2]) { if(!(wire[i]%2)) { #ifdef PROFILE dtime = - dclock(); #endif if(conjugated) partrans_cmv_dag(non_local_chi_cb[wire[i]]/2,uc_nl_cb[parity][wire[i]],gauge,rcv_buf[wire[i]],vout[i]); else partrans_cmv(non_local_chi_cb[wire[i]]/2,uc_nl_cb[parity][wire[i]],gauge,rcv_buf[wire[i]],vout[i]); #ifdef PROFILE dtime +=dclock(); print_flops("",fname,66*non_local_chi_cb[wire[i]],dtime); #endif } //Otherwise we have parallel transport in the positive direction. //In this case, the received data has already been pre-multiplied //All we need to do is to put the transported field in the correct place else { #ifdef PROFILE dtime = - dclock(); #endif #if 1 pt_copy(non_local_chi_cb[wire[i]]/2,uc_nl_cb[parity][wire[i]],rcv_buf[wire[i]],vout[i]); #else //Place the data in the receive buffer into the result vector for(s=0;s<non_local_chi_cb[wire[i]];s++) { fp0 = (IFloat *)((long)rcv_buf[wire[i]]+uc_nl_cb[parity][wire[i]][s].src); fp1 = (IFloat *)((long)vout[i]+uc_nl_cb[parity][wire[i]][s].dest); for(d = 0;d<VECT_LEN;d++) *(fp1+d) = *(fp0+d); //moveMem(fp1,fp0,VECT_LEN*sizeof(IFloat)); } #endif #ifdef PROFILE dtime +=dclock(); print_flops(fname,"pt_copy()",0,dtime); #endif } } } // ParTrans::PTflops +=33*n*vol; }
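//------------------------------------------------------------------
// Illustrative sketch (not part of the build) of the overlap pattern in
// vec_cb_norm above: boundary transfers are started asynchronously
// (SlowStartTrans), the bulk of the link*vector work is done on purely local
// sites, and only then does the code wait (TransComplete) before multiplying
// the received boundary data. std::async stands in for the SCU transfer
// engine; the callables passed in are hypothetical helpers, not CPS API.
//------------------------------------------------------------------
#if 0
#include <cstddef>
#include <functional>
#include <future>
#include <vector>

static void transport_with_overlap(const std::vector<int> &dirs,
                                   const std::function<bool(int)> &is_local,
                                   const std::function<void(int)> &exchange_face,
                                   const std::function<void(int)> &do_local_sites,
                                   const std::function<void(int)> &do_boundary_sites)
{
  std::vector<std::future<void> > transfers;

  // Start the face exchanges for directions that cross a node boundary.
  for (std::size_t i = 0; i < dirs.size(); ++i) {
    if (!is_local(dirs[i])) {
      const int d = dirs[i];
      transfers.push_back(std::async(std::launch::async,
                                     [&exchange_face, d]() { exchange_face(d); }));
    }
  }

  // Overlap: multiply on sites whose neighbour lives on this node.
  for (std::size_t i = 0; i < dirs.size(); ++i)
    do_local_sites(dirs[i]);

  // Wait until the receive buffers are filled.
  for (std::size_t i = 0; i < transfers.size(); ++i)
    transfers[i].get();

  // Finish off the sites that needed off-node data.
  for (std::size_t i = 0; i < dirs.size(); ++i)
    if (!is_local(dirs[i]))
      do_boundary_sites(dirs[i]);
}
#endif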
// CJ: change start //------------------------------------------------------------------ // EvolveMomFforce(Matrix *mom, Vector *chi, Float mass, // Float dt): // It evolves the canonical momentum mom by dt // using the fermion force. //------------------------------------------------------------------ ForceArg FdwfBase::EvolveMomFforce(Matrix *mom, Vector *chi, Float mass, Float dt){ char *fname = "EvolveMomFforce(M*,V*,F,F,F)"; VRB.Func(cname,fname); Matrix *gauge = GaugeField() ; if (Colors() != 3) ERR.General(cname,fname,"Wrong nbr of colors.") ; if (SpinComponents() != 4) ERR.General(cname,fname,"Wrong nbr of spin comp.") ; if (mom == 0) ERR.Pointer(cname,fname,"mom") ; if (chi == 0) ERR.Pointer(cname,fname,"chi") ; //---------------------------------------------------------------- // allocate space for two CANONICAL fermion fields //---------------------------------------------------------------- int f_size = FsiteSize() * GJP.VolNodeSites() ; int f_site_size_4d = 2 * Colors() * SpinComponents(); int f_size_4d = f_site_size_4d * GJP.VolNodeSites() ; char *str_v1 = "v1" ; Vector *v1 = (Vector *)fmalloc(cname,fname,str_v1,f_size*sizeof(Float)); // if (v1 == 0) ERR.Pointer(cname, fname, str_v1) ; // VRB.Smalloc(cname, fname, str_v1, v1, f_size*sizeof(Float)) ; char *str_v2 = "v2" ; Vector *v2 = (Vector *)fmalloc(cname,fname,str_v2,f_size*sizeof(Float)) ; // if (v2 == 0) ERR.Pointer(cname, fname, str_v2) ; // VRB.Smalloc(cname, fname, str_v2, v2, f_size*sizeof(Float)) ; // LatMatrix MomDiff(QFAST,4); // Matrix *mom_diff = MomDiff.Mat(); Float L1 = 0.0; Float L2 = 0.0; Float Linf = 0.0; //---------------------------------------------------------------- // Calculate v1, v2. Both v1, v2 must be in CANONICAL order after // the calculation. //---------------------------------------------------------------- VRB.Clock(cname, fname, "Before calc force vecs.\n") ; #ifdef PROFILE Float time = -dclock(); ForceFlops=0; #endif { CgArg cg_arg ; cg_arg.mass = mass ; DiracOpDwf dwf(*this, v1, v2, &cg_arg, CNV_FRM_NO) ; dwf.CalcHmdForceVecs(chi) ; Fconvert(v1,CANONICAL,WILSON); Fconvert(v2,CANONICAL,WILSON); } #ifdef PROFILE time += dclock(); print_flops(cname,fname,ForceFlops,time); #endif int mu, x, y, z, t, s, lx, ly, lz, lt, ls ; int size[5],surf[4],blklen[4],stride[4],numblk[4]; size[0] = lx = GJP.XnodeSites() ; size[1] = ly = GJP.YnodeSites() ; size[2] = lz = GJP.ZnodeSites() ; size[3] = lt = GJP.TnodeSites() ; size[4] = ls = GJP.SnodeSites() ; blklen[0] = sizeof(Float)*FsiteSize()/size[4]; numblk[0] = GJP.VolNodeSites()/size[0]*size[4]; stride[0] = blklen[0] * (size[0]-1); for (int i =1;i<4;i++){ blklen[i] = blklen[i-1] * size[i-1]; numblk[i] = numblk[i-1] / size[i]; stride[i] = blklen[i] * (size[i]-1); } for (int i =0;i<4;i++){ surf[i] = GJP.VolNodeSites()/size[i]; } //---------------------------------------------------------------- // allocate buffer space for two fermion fields that are assoc // with only one 4-D site. 
//---------------------------------------------------------------- unsigned long flag = QFAST; if (ls<4) flag|= QNONCACHE; char *str_site_v1 = "site_v1" ; char *str_site_v2 = "site_v2" ; Float *v1_buf[4],*v2_buf[4]; int pos[4]; Float *v1_buf_pos[4]; Float *v2_buf_pos[4]; for(int i =0;i<4;i++) { v1_buf[i]=(Float *)fmalloc(cname,fname,"v1_buf",surf[i]*FsiteSize()*sizeof(Float)) ; v2_buf[i]=(Float *)fmalloc(cname,fname,"v2_buf",surf[i]*FsiteSize()*sizeof(Float)) ; } // Matrix tmp_mat1, tmp_mat2 ; Matrix *tmp_mat1,*tmp_mat2; tmp_mat1 = (Matrix *)fmalloc(cname,fname,"tmp_mat1",sizeof(Matrix)*2); tmp_mat2 = tmp_mat1+1; SCUDirArgIR Send[4]; SCUDirArgIR Recv[4]; SCUDMAInst *dma[2]; for(int i = 0;i<2;i++) dma[i] = new SCUDMAInst; void *addr[2]; int f_bytes = sizeof(Float)*f_site_size_4d; int st_bytes = sizeof(Float)*f_size_4d - f_bytes; for (mu=0; mu<4; mu++){ if ( GJP.Nodes(mu) >1){ dma[0]->Init(v1_buf[mu],surf[mu]*ls*f_bytes); dma[1]->Init(v2_buf[mu],surf[mu]*ls*f_bytes); Recv[mu].Init(gjp_scu_dir[2*mu],SCU_REC,dma,2); dma[0]->Init(v1,blklen[mu],numblk[mu],stride[mu]); dma[1]->Init(v2,blklen[mu],numblk[mu],stride[mu]); Send[mu].Init(gjp_scu_dir[2*mu+1],SCU_SEND,dma,2); } } for(int i = 0;i<2;i++) delete dma[i]; VRB.Clock(cname, fname, "Before loop over links.\n") ; #ifdef PROFILE time = -dclock(); ForceFlops=0; #endif sys_cacheflush(0); for (mu=0; mu<4; mu++){ if ( GJP.Nodes(mu) >1){ Recv[mu].StartTrans(); Send[mu].StartTrans(); } } for (mu=0; mu<4; mu++){ for (pos[3]=0; pos[3]<size[3]; pos[3]++){ for (pos[2]=0; pos[2]<size[2]; pos[2]++){ for (pos[1]=0; pos[1]<size[1]; pos[1]++){ for (pos[0]=0; pos[0]<size[0]; pos[0]++){ int gauge_offset = offset(size,pos); int vec_offset = f_site_size_4d*gauge_offset ; gauge_offset = mu+4*gauge_offset ; // printf("%d %d %d %d %d\n",mu,pos[0],pos[1],pos[2],pos[3]); Float *v1_plus_mu ; Float *v2_plus_mu ; int vec_plus_mu_stride ; int vec_plus_mu_offset = f_site_size_4d ; Float coeff = -2.0 * dt ; vec_plus_mu_offset *= offset(size,pos,mu); if ((GJP.Nodes(mu)>1)&&((pos[mu]+1) == size[mu]) ) { } else { v1_plus_mu = (Float *)v1+vec_plus_mu_offset ; v2_plus_mu = (Float *)v2+vec_plus_mu_offset ; vec_plus_mu_stride = f_size_4d - f_site_size_4d ; if ((pos[mu]+1) == size[mu]) if (GJP.NodeBc(mu)==BND_CND_APRD) coeff = -coeff ; // Float time2 = -dclock(); sproj_tr[mu]( (IFloat *)tmp_mat1, (IFloat *)v1_plus_mu, (IFloat *)v2+vec_offset, ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ; sproj_tr[mu+4]( (IFloat *)tmp_mat2, (IFloat *)v2_plus_mu, (IFloat *)v1+vec_offset, ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ; // time2 += dclock(); // print_flops("","sproj",2*9*16*ls,time2); *tmp_mat1 += *tmp_mat2 ; // If GJP.Snodes > 1 sum up contributions from all s nodes if(GJP.Snodes() > 1) { glb_sum_multi_dir((Float *)tmp_mat1, 4, sizeof(Matrix)/sizeof(IFloat) ) ; } tmp_mat2->DotMEqual(*(gauge+gauge_offset), *tmp_mat1) ; tmp_mat1->Dagger(*tmp_mat2) ; tmp_mat2->TrLessAntiHermMatrix(*tmp_mat1) ; *tmp_mat2 *= coeff ; *(mom+gauge_offset) += *tmp_mat2 ; Float norm = tmp_mat2->norm(); Float tmp = sqrt(norm); L1 += tmp; L2 += norm; Linf = (tmp>Linf ? tmp : Linf); } } } } } // end for x,y,z,t } // end for mu for (mu=0; mu<4; mu++){ if ( GJP.Nodes(mu) >1){ Recv[mu].TransComplete(); Send[mu].TransComplete(); } } //------------------------------------------------------------------ // start by summing first over direction (mu) and then over site // to allow SCU transfers to happen face-by-face in the outermost // loop. 
//------------------------------------------------------------------ for(int i = 0;i<4;i++){ v1_buf_pos[i] = v1_buf[i]; v2_buf_pos[i] = v2_buf[i]; } for (mu=0; mu<4; mu++){ for (pos[3]=0; pos[3]<size[3]; pos[3]++){ for (pos[2]=0; pos[2]<size[2]; pos[2]++){ for (pos[1]=0; pos[1]<size[1]; pos[1]++){ for (pos[0]=0; pos[0]<size[0]; pos[0]++){ int gauge_offset = offset(size,pos); int vec_offset = f_site_size_4d*gauge_offset ; gauge_offset = mu+4*gauge_offset ; Float *v1_plus_mu ; Float *v2_plus_mu ; int vec_plus_mu_stride ; int vec_plus_mu_offset = f_site_size_4d ; Float coeff = -2.0 * dt ; // printf("%d %d %d %d %d\n",mu,pos[0],pos[1],pos[2],pos[3]); vec_plus_mu_offset *= offset(size,pos,mu); if ((GJP.Nodes(mu)>1)&&((pos[mu]+1) == size[mu]) ) { v1_plus_mu = v1_buf_pos[mu] ; v2_plus_mu = v2_buf_pos[mu] ; v1_buf_pos[mu] += f_site_size_4d; v2_buf_pos[mu] += f_site_size_4d; vec_plus_mu_stride = (surf[mu] -1)*f_site_size_4d ; if (GJP.NodeBc(mu)==BND_CND_APRD) coeff = -coeff ; // Float time2 = -dclock(); sproj_tr[mu]( (IFloat *)tmp_mat1, (IFloat *)v1_plus_mu, (IFloat *)v2+vec_offset, ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ; sproj_tr[mu+4]( (IFloat *)tmp_mat2, (IFloat *)v2_plus_mu, (IFloat *)v1+vec_offset, ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ; // time2 += dclock(); // print_flops("","sproj",2*9*16*ls,time2); *tmp_mat1 += *tmp_mat2 ; // If GJP.Snodes > 1 sum up contributions from all s nodes if(GJP.Snodes() >1 ) { glb_sum_multi_dir((Float *)tmp_mat1, 4, sizeof(Matrix)/sizeof(IFloat) ) ; } tmp_mat2->DotMEqual(*(gauge+gauge_offset), *tmp_mat1) ; tmp_mat1->Dagger(*tmp_mat2) ; tmp_mat2->TrLessAntiHermMatrix(*tmp_mat1) ; *tmp_mat2 *= coeff ; *(mom+gauge_offset) += *tmp_mat2 ; Float norm = tmp_mat2->norm(); Float tmp = sqrt(norm); L1 += tmp; L2 += norm; Linf = (tmp>Linf ? tmp : Linf); } } } } } // end for x,y,z,t } // end for mu ForceFlops += (2*9*16*ls + 18+ 198+36+24)*lx*ly*lz*lt*4; #ifdef PROFILE time += dclock(); print_flops(cname,fname,ForceFlops,time); #endif //------------------------------------------------------------------ // deallocate smalloc'd space //------------------------------------------------------------------ for(int i =0;i<4;i++) { ffree(v1_buf[i],cname,fname,"v1_buf"); ffree(v2_buf[i],cname,fname,"v2_buf"); } VRB.Sfree(cname, fname, str_v2, v2) ; ffree(v2) ; VRB.Sfree(cname, fname, str_v1, v1) ; ffree(v1) ; ffree(tmp_mat1); glb_sum(&L1); glb_sum(&L2); glb_max(&Linf); L1 /= 4.0*GJP.VolSites(); L2 /= 4.0*GJP.VolSites(); return ForceArg(L1, sqrt(L2), Linf); }
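//------------------------------------------------------------------
// Illustrative sketch (not part of the build) of the force-magnitude
// bookkeeping that the EvolveMom* routines return in a ForceArg: per-link
// Frobenius norms are reduced to a mean (L1), a root-mean-square (L2) and a
// maximum (Linf), then normalized by the global number of links. The node
// reductions (glb_sum/glb_max) are indicated but omitted; names hypothetical.
//------------------------------------------------------------------
#if 0
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

struct ForceStats { double L1, L2, Linf; };

// link_norm2 holds |F|^2 for every link on this node;
// n_links_global = 4 * (global lattice volume).
static ForceStats force_stats(const std::vector<double> &link_norm2,
                              double n_links_global)
{
  double L1 = 0.0, L2 = 0.0, Linf = 0.0;
  for (std::size_t i = 0; i < link_norm2.size(); ++i) {
    double n = std::sqrt(link_norm2[i]);
    L1 += n;                        // sum of |F|
    L2 += link_norm2[i];            // sum of |F|^2
    Linf = std::max(Linf, n);       // largest single-link force
  }
  // glb_sum(&L1); glb_sum(&L2); glb_max(&Linf);   // node reduction would go here
  L1 /= n_links_global;
  L2 /= n_links_global;
  ForceStats out = { L1, std::sqrt(L2), Linf };
  return out;
}
#endif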