Пример #1
0
// Accumulates the force product into f:
//     f += coeff * U_dir(x) * v(x)^dagger
// Note that STAG order stores the hermitian conjugate of the links.
//
//   v     - input matrix field
//   dir   - direction index; selects the gauge-field slice GaugeField()+dir
//   coeff - scalar weight handed to the kernel
//   f     - force field; passed to the kernel as both first and fifth argument
void Fasqtad::force_product_sum(const Matrix *v,  int dir,
                                IFloat coeff, Matrix *f){
  // Integer image of the v pointer so that it can be re-based below.
  unsigned long src_addr = (unsigned long)v;

  // The re-basing is only applied when all three operands are reported as
  // "fast" by qalloc_is_fast().
  // NOTE(review): the two constants look like hardware address windows of
  // the target machine -- confirm against the platform memory map.
  const bool all_fast = qalloc_is_fast((Matrix *)v) &&
                        qalloc_is_fast((Matrix *)f) &&
                        qalloc_is_fast((Matrix *)(GaugeField()+dir));
  if (all_fast) {
    src_addr = src_addr - 0xb0000000 + 0x9c000000;
  }

  // Number of sites on this node: product of the four local extents.
  int site_count = node_sites[0];
  for (int d = 1; d < 4; ++d) {
    site_count *= node_sites[d];
  }

  // 234 flops are booked per site.
  ForceFlops += site_count * 234;

  fgdagm1dagpm2(f, &coeff, (GaugeField()+dir), (const Matrix*)src_addr, f,
                &site_count);
}
Пример #2
0
//------------------------------------------------------------------------------
// GforceSite(Matrix& force, int *x, int mu):
// Computes the gauge force for the link in direction mu at site x and
// returns it through `force`.
//
//   force - output; overwritten with the traceless anti-hermitian force
//   x     - site coordinates (passed on to GsiteOffset/Staple/RectStaple)
//   mu    - link direction
//
// The plaquette-staple and rectangle-staple contributions are each
// multiplied by the link U_mu(x) and weighted by plaq_coeff / rect_coeff.
// ForceFlops is incremented along the way.
//------------------------------------------------------------------------------
void GimprRect::GforceSite(Matrix& force, int *x, int mu)
{
  const char *fname = "GforceSite(M&,i*,i)";
  setCbufCntrlReg(4, CBUF_MODE4);

  // Address of U_mu(x) inside the gauge field.
  Matrix *const link_ptr = GaugeField() + GsiteOffset(x) + mu;

  //----------------------------------------------------------------------------
  // staple <- plaquette staple around the link
  //----------------------------------------------------------------------------
  Matrix staple;
  Staple(staple, x, mu);
  ForceFlops += 198*3*3+12+216*3;

  //----------------------------------------------------------------------------
  // link <- private copy of U_mu(x); it is rescaled further down
  //----------------------------------------------------------------------------
  Matrix link(*link_ptr);

  //----------------------------------------------------------------------------
  // force = plaq_coeff * U_mu(x) * staple
  // (plaq_coeff is -(beta*(1-8*c_1)/3) according to the original notes)
  //----------------------------------------------------------------------------
  force.DotMEqual(link, staple);
  force *= plaq_coeff;

  //----------------------------------------------------------------------------
  // staple <- rectangle staple (the temporary is reused)
  //----------------------------------------------------------------------------
  RectStaple(staple, x, mu);
  ForceFlops += 198*3*18+216*3*6;

  //----------------------------------------------------------------------------
  // link <- rect_coeff * U_mu(x)
  // (rect_coeff is -(beta*c_1/3) according to the original notes)
  //----------------------------------------------------------------------------
  link *= rect_coeff;
  ForceFlops +=234;

  //----------------------------------------------------------------------------
  // force += rect_coeff * U_mu(x) * rect_staple
  //----------------------------------------------------------------------------
  force.DotMPlus(link, staple);

  //----------------------------------------------------------------------------
  // Project: TrLessAntiHermMatrix is fed the dagger of the accumulated force.
  //----------------------------------------------------------------------------
  staple.Dagger(force);
  force.TrLessAntiHermMatrix(staple);
  ForceFlops +=198+24;
}
Пример #3
0
// Molecular-dynamics force of the Nf=2 ratio action.
//
// Steps (each timed and reported at TIMING_VERB_LEVEL):
//   1. force terms:  D1 force on (xi, D1 xi) minus D2 force on (xi, phi_),
//      with xi = DdagD1_inv(D2^dagger phi_);
//   2. optional smearing of the force when smeared_ is set;
//   3. traceless anti-hermitian projection, which is the value returned.
// When ANTIPERIODIC_BC is defined, BC->apply_bc(*u_) is invoked before the
// solve and again after the force terms.
GaugeField Action_Nf2_ratio::md_force(){
#ifdef ANTIPERIODIC_BC
  BC->apply_bc(*u_);
#endif

  const Field d2dag_phi = D2_->mult_dag(phi_);
  Field xi = DdagD1_inv(d2dag_phi);

  long double timing;
  FINE_TIMING_START(timing);

  // D1 contribution, then subtract the D2 contribution.
  const Field d1_xi = D1_->mult(xi);
  GaugeField total(D1_->md_force(xi, d1_xi));
  const GaugeField d2_term(D2_->md_force(xi, phi_));
  total -= d2_term;

  FINE_TIMING_END(timing);
  _Message(TIMING_VERB_LEVEL, "[Timing] - Action_Nf2_ratio::md_force"
           << " - Force terms timing = "
           << timing << std::endl);

#ifdef ANTIPERIODIC_BC
  BC->apply_bc(*u_);
#endif

  FINE_TIMING_START(timing);

  if (smeared_) {
    smart_conf_->smeared_force(total);
  }

  FINE_TIMING_END(timing);
  _Message(TIMING_VERB_LEVEL, "[Timing] - Action_Nf2_ratio::md_force"
           << " - Smeared force timing = "
           << timing << std::endl);

  FINE_TIMING_START(timing);

  GaugeField force = FieldUtils::TracelessAntihermite(total);

  FINE_TIMING_END(timing);
  _Message(TIMING_VERB_LEVEL, "[Timing] - Action_Nf2_ratio::md_force"
           << " - TracelessAntihermite timing = "
           << timing << std::endl);

  _MonitorMsg(ACTION_VERB_LEVEL, Action,force, name_);
  return force;
}
Пример #4
0
//------------------------------------------------------------------
// "Odd" fermion force evolution routine written by Chris Dawson and
// taken over verbatim; it has not been tuned for QCDOC.
//
// Evolves the momenta `mom` using the fermion force built from the two
// pseudo-fermion fields `phi` and `eta`.
//
//   mom  - momenta, one Matrix per link (4 per site); updated in place
//   phi  - odd pseudo-fermion field
//   eta  - very odd pseudo-fermion field
//   mass - mass handed to the Dirac operator through CgArg
//   dt   - step size; the per-link coefficient is -2*dt, with the sign
//          flipped on links that cross an anti-periodic node boundary
//
// For every link the traceless anti-hermitian matrix F is formed and
// `mom -= F` is applied.  With n = F.norm(), the returned ForceArg holds
//   L1   = sum sqrt(n) / (4*VolSites)
//   L2   = sqrt( sum n / (4*VolSites) )
//   Linf = max sqrt(n)
// after global reduction.
//
// Pointer arguments are validated with ERR.Pointer before use.
//------------------------------------------------------------------
ForceArg FdwfBase::EvolveMomFforce( Matrix* mom, // momenta
                               Vector* phi, // odd pseudo-fermion field
                               Vector* eta, // very odd pseudo-fermion field
                               Float  mass, 
                               Float dt )
{
  char *fname = "EvolveMomFforce(M*,V*,V*,F,F)";
  VRB.Func(cname,fname);
  
  Matrix *gauge = GaugeField() ;

  if (Colors() != 3)       { ERR.General(cname,fname,"Wrong nbr of colors.") ; }
  if (SpinComponents()!=4) { ERR.General(cname,fname,"Wrong nbr of spin comp.") ;}
  if (mom == 0)            { ERR.Pointer(cname,fname,"mom") ; }
  if (phi == 0)            { ERR.Pointer(cname,fname,"phi") ; }
  // eta is read through CopyVec below, so it is validated like mom and phi.
  if (eta == 0)            { ERR.Pointer(cname,fname,"eta") ; }
   
  // allocate space for two CANONICAL fermion fields

  // these are all full fermion vector sizes ( i.e. *not* preconditioned )

  const int f_size        ( FsiteSize() * GJP.VolNodeSites() );
  const int f_size_cb     ( f_size/2 ) ; // f_size must be multiple of 2
  const int f_site_size_4d( 2 * Colors() * SpinComponents() );
  const int f_size_4d     ( f_site_size_4d * GJP.VolNodeSites()) ;
  
  char *str_v1 = "v1" ;
  Vector *v1 = (Vector *)smalloc(f_size*sizeof(Float)) ;
  if (v1 == 0) ERR.Pointer(cname, fname, str_v1) ;
  VRB.Smalloc(cname, fname, str_v1, v1, f_size*sizeof(Float)) ;

  char *str_v2 = "v2" ;
  Vector *v2 = (Vector *)smalloc(f_size*sizeof(Float)) ;
  if (v2 == 0) ERR.Pointer(cname, fname, str_v2) ;
  VRB.Smalloc(cname, fname, str_v2, v2, f_size*sizeof(Float)) ;

  // running force norms
  Float L1 = 0.0;
  Float L2 = 0.0;
  Float Linf = 0.0;

#ifdef PROFILE
  Float time = -dclock();
  ForceFlops=0;
#endif

  //Calculate v1, v2. Both must be in CANONICAL order afterwards
  {
    CgArg cg_arg ;
    cg_arg.mass = mass ;
    
    DiracOpDwf dwf(*this, v1, v2, &cg_arg, CNV_FRM_YES) ;
    Float kappa( 1.0 / ( 2 * (4 + GJP.DwfA5Inv() - GJP.DwfHeight())));

    v2->CopyVec(phi,f_size_cb);

    // rescale the input field. As the second half of the this field
    // will be constructed by acting with the PC dslash on v1, this
    // rescales *one* of the full vectors - giving rise to an overall
    // rescaling of the final answer by exactly -\kappa^2
    
    v2->VecTimesEquFloat(-kappa*kappa,f_size_cb);

    // only need one factor of -\kappa^2, so don't rescale the second
    // full vector (v2)
    v1->CopyVec(eta,f_size_cb);
        
    dwf.Dslash(v2+(f_size_cb/6), v2 , CHKB_ODD, DAG_YES);
    dwf.Dslash(v1+(f_size_cb/6), v1 , CHKB_ODD, DAG_NO);
    
    // v1 and v2 are now the vectors needed to contruct the force term
    // written in ( ODD, EVEN ) ordering. They will be converted back
    // into canonical ordering when the destructor is called.
    
  }  

  // two fermion vectors at a single position
  //    - these will be used to store off-node
  //      field components

  char *str_site_v1 = "site_v1" ;
  Float *site_v1 = (Float *)smalloc(FsiteSize()*sizeof(Float)) ;
  if (site_v1 == 0) ERR.Pointer(cname, fname, str_site_v1) ;
  VRB.Smalloc(cname, fname, str_site_v1, site_v1, FsiteSize()*sizeof(Float)) ;

  char *str_site_v2 = "site_v2" ;
  Float *site_v2 = (Float *)smalloc(FsiteSize()*sizeof(Float)) ;
  if (site_v2 == 0) ERR.Pointer(cname, fname, str_site_v2) ;
  VRB.Smalloc(cname, fname, str_site_v2, site_v2, FsiteSize()*sizeof(Float)) ;

  // evolve the momenta by the fermion force
  int mu, x, y, z, t, s;
 
  const int lx(GJP.XnodeSites());
  const int ly(GJP.YnodeSites());
  const int lz(GJP.ZnodeSites());
  const int lt(GJP.TnodeSites());
  const int ls(GJP.SnodeSites());

  // local extents indexed by direction, so that the four directions can
  // share one code path
  const int node_len[4] = { lx, ly, lz, lt };

  // f_size_4d-f_site_size_4d is the number of floats between the end of
  // one spinor at s and the start of the spinor at s+1
  const int s_gap = f_size_4d - f_site_size_4d ;

  // number of Floats in one Matrix (used for the s-node reduction)
  const int mat_floats = sizeof(Matrix)/sizeof(Float) ;
  
  // start by summing first over direction (mu) and then over site to
  // allow SCU transfers to happen face-by-face in the outermost loop.

  VRB.Clock(cname, fname, "Before loop over links.\n") ;

  for (mu=0; mu<4; mu++) {
    for (t=0; t<lt; t++){
      for (z=0; z<lz; z++){
        for (y=0; y<ly; y++){
          for (x=0; x<lx; x++) {
            // 4d index of this site on the local node
            const int site = x+lx*(y+ly*(z+lz*t));
            
            // offset for vector field at this point
            // (4d only, no fifth dimension)
            const int vec_offset = f_site_size_4d*site ;
            
            // offset for link in mu direction from this point
            const int gauge_offset = mu+4*site ;

            // coordinates of the next position in the mu direction,
            // wrapped around the local node
            int nbr[4] = { x, y, z, t };
            const bool off_node = ( nbr[mu]+1 == node_len[mu] ) ;
            nbr[mu] = (nbr[mu]+1)%node_len[mu] ;

            // offset for a fermion field at the neighbouring point
            const int vec_plus_mu_offset =
              f_site_size_4d * ( nbr[0]+lx*(nbr[1]+ly*(nbr[2]+lz*nbr[3])) ) ;
            
            Float *v1_plus_mu=NULL ;
            Float *v2_plus_mu=NULL ;
            int vec_plus_mu_stride=0 ;
            
            // sign of coeff (look at momenta update)
            Float coeff = -2.0 * dt ;

            if (off_node) 
              {
                // off-node
                for (s=0; s<ls; s++) 
                  {
                    // fill site_v1 and site_v2 with v1 and v2 data
                    // from the first slice of the next node, need loop
                    // because data is not contiguous in memory 
                    getPlusData( (Float *)site_v1+s*f_site_size_4d,
                                 (Float *)v1+vec_plus_mu_offset+s*f_size_4d,
                                 f_site_size_4d, mu) ;
                    getPlusData( (Float *)site_v2+s*f_site_size_4d,
                                 (Float *)v2+vec_plus_mu_offset+s*f_size_4d,
                                 f_site_size_4d, mu) ;
                  } // end for s
                
                v1_plus_mu = site_v1   ;  
                v2_plus_mu = site_v2   ;  
                vec_plus_mu_stride = 0 ;  // field now contiguous
                
                // GJP.[XYZT]nodeBc() gives the forward boundary
                // condition only (so this should work).
                bool anti_periodic = false ;
                switch (mu) 
                  {
                  case 0 : anti_periodic = (GJP.XnodeBc()==BND_CND_APRD) ; break ;
                  case 1 : anti_periodic = (GJP.YnodeBc()==BND_CND_APRD) ; break ;
                  case 2 : anti_periodic = (GJP.ZnodeBc()==BND_CND_APRD) ; break ;
                  case 3 : anti_periodic = (GJP.TnodeBc()==BND_CND_APRD) ; break ;
                  }
                if (anti_periodic) coeff = -coeff ;
              } 
            else 
              {
                // on - node
                //
                // just add offset to v1 and v2
                // (they are now 1 forward in the mu direction )
                //
                v1_plus_mu = (Float *)v1+vec_plus_mu_offset ;
                v2_plus_mu = (Float *)v2+vec_plus_mu_offset ;
                vec_plus_mu_stride = s_gap ; // explained below
              }

            Matrix tmp_mat1, tmp_mat2;  

            // the non-zero stride pattern is due to domain wall
            // fermions ( summing up *ls* different sproj's )
            // 
            // vec_plus_mu_stride equals s_gap, except when
            // this is off boundary, in that case the info
            // is copied into a contiguous block in the above code
            // and vec_plus_mu_stride set to zero
            
            // ( 1 - gamma_\mu ) Tr_s [ v1(x+\mu) v2^{\dagger}(x) ]
            sproj_tr[mu]( (Float *)&tmp_mat1,
                          (Float *)v1_plus_mu,
                          (Float *)v2+vec_offset,
                          ls, vec_plus_mu_stride, s_gap) ;
            
            // (1 + gamma_\mu)  Tr_s [ v2(x+\mu) v1^{\dagger}(x) ]
            sproj_tr[mu+4]( (Float *)&tmp_mat2,
                            (Float *)v2_plus_mu,
                            (Float *)v1+vec_offset,
                            ls, vec_plus_mu_stride, s_gap) ;
            
            // add the two projected traces
            tmp_mat1 += tmp_mat2 ;
            
            // with more than one node in s, sum every matrix element
            // over direction 4
            if(GJP.Snodes() != 1) {
              for (s=0; s<mat_floats; ++s) {
                glb_sum_dir((Float *)&tmp_mat1 + s, 4) ;
              }
            }
            
            // multiply sum by the link in the \mu direction
            tmp_mat2.DotMEqual(*(gauge+gauge_offset), tmp_mat1) ;
            
            // take tracless antihermitian piece
            // TrLessAntiHermMatrix need to be passed
            // the dagger of the matrix in question
            tmp_mat1.Dagger(tmp_mat2) ;
            tmp_mat2.TrLessAntiHermMatrix(tmp_mat1) ;

            tmp_mat2 *= coeff ;
            
            // note the minus sign.
            *(mom+gauge_offset) -= tmp_mat2 ;

            // accumulate the force norms for this link
            Float norm = tmp_mat2.norm();
            Float tmp = sqrt(norm);
            L1 += tmp;
            L2 += norm;
            Linf = (tmp>Linf ? tmp : Linf);
            
          } // end for x
        } // end for y
      } // end for z
    } // end for t
  } // end for mu
  ForceFlops += (2*9*16*ls + 18+ 198+36+24)*lx*ly*lz*lt*4;
#ifdef PROFILE
  time += dclock();
  print_flops(cname,fname,ForceFlops,time);
#endif
  
  // deallocate smalloc'd space

  VRB.Sfree(cname, fname, str_site_v2, site_v2) ;
  sfree(site_v2) ;
 
  VRB.Sfree(cname, fname, str_site_v1, site_v1) ;
  sfree(site_v1) ;
 
  VRB.Sfree(cname, fname, str_v2, v2) ;
  sfree(v2) ;
  
  VRB.Sfree(cname, fname, str_v1, v1) ;
  sfree(v1) ;

  // reduce the norms over all nodes and normalise per link
  glb_sum(&L1);
  glb_sum(&L2);
  glb_max(&Linf);

  L1 /= 4.0*GJP.VolSites();
  L2 /= 4.0*GJP.VolSites();

  VRB.FuncEnd(cname,fname);
  return ForceArg(L1, sqrt(L2), Linf);

}
Пример #5
0
CPS_START_NAMESPACE
/*!\file
  \brief  Implementation of FdwfBase class.

  $Id: f_dwf_base_force.C,v 1.14 2012-08-31 04:55:08 chulwoo Exp $
*/
//--------------------------------------------------------------------
//  CVS keywords
//
//  $Source: /home/chulwoo/CPS/repo/CVS/cps_only/cps_pp/src/util/lattice/f_dwf_base/noarch/f_dwf_base_force.C,v $
//  $State: Exp $
//
//--------------------------------------------------------------------
//------------------------------------------------------------------
//
// f_dwf_base_force.C
//
// (R)HMC force term for FdwfBase
//
//------------------------------------------------------------------

CPS_END_NAMESPACE
#include <util/qcdio.h>
#include <math.h>
#include <util/lattice.h>
#include <util/dirac_op.h>
#include <util/dwf.h>
#include <util/gjp.h>
#include <util/verbose.h>
#include <util/vector.h>
#include <util/random.h>
#include <util/error.h>
#include <util/time_cps.h>
#include <comms/scu.h> // GRF
#include <comms/glb.h>
CPS_START_NAMESPACE
#undef PROFILE

// CJ: change start
//------------------------------------------------------------------
// EvolveMomFforce(Matrix *mom, Vector *chi, Float mass, 
//                 Float dt):
// Advances the canonical momentum mom by a step dt using the
// fermion force.
//
//   mom  - momenta, one Matrix per link (4 per site); updated in place
//   chi  - fermion field handed to DiracOpDwf::CalcHmdForceVecs
//   mass - mass passed to the Dirac operator through CgArg
//   dt   - step size; the per-link coefficient is -2*dt, with the sign
//          flipped on links that cross an anti-periodic node boundary
//
// For every link the traceless anti-hermitian matrix F is formed and
// `mom += F` is applied.  With n = F.norm(), the returned ForceArg holds
//   L1   = sum sqrt(n) / (4*VolSites)
//   L2   = sqrt( sum n / (4*VolSites) )
//   Linf = max sqrt(n)
// after global reduction.
//------------------------------------------------------------------
ForceArg FdwfBase::EvolveMomFforce(Matrix *mom, Vector *chi, 
                           Float mass, Float dt){
  // NOTE(review): this label lists three Float arguments while the
  // signature has two; kept unchanged because it appears in log output.
  char *fname = "EvolveMomFforce(M*,V*,F,F,F)";
  VRB.Func(cname,fname);

  Matrix *gauge = GaugeField() ;

  //----------------------------------------------------------------
  // sanity checks
  //----------------------------------------------------------------
  if (Colors() != 3) {
    ERR.General(cname,fname,"Wrong nbr of colors.") ;
  }
  if (SpinComponents() != 4) {
    ERR.General(cname,fname,"Wrong nbr of spin comp.") ;
  }
  if (mom == 0) {
    ERR.Pointer(cname,fname,"mom") ;
  }
  if (chi == 0) {
    ERR.Pointer(cname,fname,"chi") ;
  }

  //----------------------------------------------------------------
  // two full-size fermion fields in CANONICAL order
  //----------------------------------------------------------------
  const int f_size = FsiteSize() * GJP.VolNodeSites() ;
  const int f_site_size_4d = 2 * Colors() * SpinComponents();
  const int f_size_4d = f_site_size_4d * GJP.VolNodeSites() ;

  // floats between the end of the spinor at s and the start of the
  // spinor at s+1 for a fixed 4-D site
  const int s_gap = f_size_4d - f_site_size_4d ;

  char *str_v1 = "v1" ;
  Vector *v1 = (Vector *)smalloc(f_size*sizeof(Float)) ;
  if (v1 == 0) { ERR.Pointer(cname, fname, str_v1) ; }
  VRB.Smalloc(cname, fname, str_v1, v1, f_size*sizeof(Float)) ;

  char *str_v2 = "v2" ;
  Vector *v2 = (Vector *)smalloc(f_size*sizeof(Float)) ;
  if (v2 == 0) { ERR.Pointer(cname, fname, str_v2) ; }
  VRB.Smalloc(cname, fname, str_v2, v2, f_size*sizeof(Float)) ;

  //----------------------------------------------------------------
  // staging buffers holding the data of a single 4-D site (all s)
  // received from the neighbouring node
  //----------------------------------------------------------------
  char *str_site_v1 = "site_v1" ;
  Float *site_v1 = (Float *)smalloc(FsiteSize()*sizeof(Float)) ;
  if (site_v1 == 0) { ERR.Pointer(cname, fname, str_site_v1) ; }
  VRB.Smalloc(cname, fname, str_site_v1, site_v1, FsiteSize()*sizeof(Float)) ;

  char *str_site_v2 = "site_v2" ;
  Float *site_v2 = (Float *)smalloc(FsiteSize()*sizeof(Float)) ;
  if (site_v2 == 0) { ERR.Pointer(cname, fname, str_site_v2) ; }
  VRB.Smalloc(cname, fname, str_site_v2, site_v2, FsiteSize()*sizeof(Float)) ;

  // running force norms
  Float L1 = 0.0;
  Float L2 = 0.0;
  Float Linf = 0.0;

  //----------------------------------------------------------------
  // Fill v1, v2 with the HMD force vectors.  The Dirac operator is
  // scoped so that its destructor runs before the fields are read.
  //----------------------------------------------------------------
  VRB.Clock(cname, fname, "Before calc force vecs.\n") ;
  VRB.Flow(cname, fname, "Before calc force vecs.\n") ;

  {
    CgArg cg_arg ;
    cg_arg.mass = mass ;

    DiracOpDwf dwf(*this, v1, v2, &cg_arg, CNV_FRM_YES) ;
    dwf.CalcHmdForceVecs(chi) ;
  }
  VRB.Flow(cname, fname, "After calc force vecs.\n") ;
#ifdef PROFILE
  Float time = -dclock();
  ForceFlops=0;
#endif

  const int lx = GJP.XnodeSites() ;
  const int ly = GJP.YnodeSites() ;
  const int lz = GJP.ZnodeSites() ;
  const int lt = GJP.TnodeSites() ;
  const int ls = GJP.SnodeSites() ;

  // local extents indexed by direction
  const int node_len[4] = { lx, ly, lz, lt } ;

  Matrix tmp_mat1, tmp_mat2 ;

  //----------------------------------------------------------------
  // Direction is the outermost loop so that SCU transfers happen
  // face-by-face.
  //----------------------------------------------------------------
  VRB.Clock(cname, fname, "Before loop over links.\n") ;

  for (int mu = 0; mu < 4; ++mu) {
    for (int t = 0; t < lt; ++t) {
      for (int z = 0; z < lz; ++z) {
        for (int y = 0; y < ly; ++y) {
          for (int x = 0; x < lx; ++x) {
            // index of this site, of its spinor and of its mu-link
            const int site = x+lx*(y+ly*(z+lz*t)) ;
            const int vec_offset = f_site_size_4d*site ;
            const int gauge_offset = mu+4*site ;

            // neighbour one step forward in mu, wrapped on the node
            int nbr[4] = { x, y, z, t } ;
            const bool off_node = ( nbr[mu]+1 == node_len[mu] ) ;
            nbr[mu] = (nbr[mu]+1)%node_len[mu] ;
            const int vec_plus_mu_offset =
              f_site_size_4d * ( nbr[0]+lx*(nbr[1]+ly*(nbr[2]+lz*nbr[3])) ) ;

            Float *v1_plus_mu ;
            Float *v2_plus_mu ;
            int vec_plus_mu_stride ;
            Float coeff = -2.0 * dt ;

            if (off_node) {
              // fetch the neighbour's spinors for every s into the
              // contiguous staging buffers
              for (int s = 0; s < ls; ++s) {
                getPlusData( (IFloat *)site_v1+s*f_site_size_4d,
                             (IFloat *)v1+vec_plus_mu_offset+s*f_size_4d,
                             f_site_size_4d, mu) ;
                getPlusData( (IFloat *)site_v2+s*f_site_size_4d,
                             (IFloat *)v2+vec_plus_mu_offset+s*f_size_4d,
                             f_site_size_4d, mu) ;
              }
              v1_plus_mu = site_v1 ;
              v2_plus_mu = site_v2 ;
              vec_plus_mu_stride = 0 ;

              // flip the sign across an anti-periodic boundary
              bool flip_sign = false ;
              switch (mu) {
                case 0 : flip_sign = (GJP.XnodeBc()==BND_CND_APRD) ; break ;
                case 1 : flip_sign = (GJP.YnodeBc()==BND_CND_APRD) ; break ;
                case 2 : flip_sign = (GJP.ZnodeBc()==BND_CND_APRD) ; break ;
                case 3 : flip_sign = (GJP.TnodeBc()==BND_CND_APRD) ; break ;
              }
              if (flip_sign) coeff = -coeff ;
            } else {
              // neighbour lives on this node: index the fields directly
              v1_plus_mu = (Float *)v1+vec_plus_mu_offset ;
              v2_plus_mu = (Float *)v2+vec_plus_mu_offset ;
              vec_plus_mu_stride = s_gap ;
            }

            sproj_tr[mu]( (IFloat *)&tmp_mat1,
                          (IFloat *)v1_plus_mu,
                          (IFloat *)v2+vec_offset,
                          ls, vec_plus_mu_stride, s_gap) ;

            sproj_tr[mu+4]( (IFloat *)&tmp_mat2,
                            (IFloat *)v2_plus_mu,
                            (IFloat *)v1+vec_offset,
                            ls, vec_plus_mu_stride, s_gap) ;

            tmp_mat1 += tmp_mat2 ;

            // If GJP.Snodes > 1 sum up contributions from all s nodes
            if (GJP.Snodes() > 1) {
              glb_sum_multi_dir((Float *)&tmp_mat1,4,sizeof(Matrix)/sizeof(IFloat));
            }

            // multiply by the link, then take the traceless
            // anti-hermitian part (which is fed the dagger)
            tmp_mat2.DotMEqual(gauge[gauge_offset], tmp_mat1) ;
            tmp_mat1.Dagger(tmp_mat2) ;
            tmp_mat2.TrLessAntiHermMatrix(tmp_mat1) ;

            tmp_mat2 *= coeff ;

            mom[gauge_offset] += tmp_mat2 ;

            // accumulate the force norms for this link
            const Float norm = tmp_mat2.norm();
            const Float tmp = sqrt(norm);
            L1 += tmp;
            L2 += norm;
            if (tmp > Linf) Linf = tmp;
          } // end for x
        } // end for y
      } // end for z
    } // end for t
  } // end for mu
  ForceFlops += (2*9*16*ls + 18+ 198+36+24)*lx*ly*lz*lt*4;
#ifdef PROFILE
  time += dclock();
  print_flops(cname,fname,ForceFlops,time);
#endif

  //----------------------------------------------------------------
  // release the smalloc'd buffers in reverse order of allocation
  //----------------------------------------------------------------
  VRB.Sfree(cname, fname, str_site_v2, site_v2) ;
  sfree(site_v2) ;

  VRB.Sfree(cname, fname, str_site_v1, site_v1) ;
  sfree(site_v1) ;

  VRB.Sfree(cname, fname, str_v2, v2) ;
  sfree(v2) ;

  VRB.Sfree(cname, fname, str_v1, v1) ;
  sfree(v1) ;

  // reduce the norms over all nodes and normalise per link
  glb_sum(&L1);
  glb_sum(&L2);
  glb_max(&Linf);

  L1 /= 4.0*GJP.VolSites();
  L2 /= 4.0*GJP.VolSites();

  VRB.FuncEnd(cname,fname);
  return ForceArg(L1, sqrt(L2), Linf);

}
Пример #6
0
//------------------------------------------------------------------
// EvolveMomFforce(Matrix *mom, Vector *frm, Float mass, 
//                 Float dt):
// It evolves the canonical momentum mom by dt
// using the fermion force.
//------------------------------------------------------------------
ForceArg Fwilson::EvolveMomFforce(Matrix *mom, Vector *chi, 
			      Float mass, Float dt)
{
  char *fname = "EvolveMomFforce(M*,V*,F,F,F)";
  VRB.Func(cname,fname);

  Matrix *gauge = GaugeField() ;

  if (Colors() != 3)
    ERR.General(cname,fname,"Wrong nbr of colors.") ;

  if (SpinComponents() != 4)
    ERR.General(cname,fname,"Wrong nbr of spin comp.") ;

  if (mom == 0)
    ERR.Pointer(cname,fname,"mom") ;

  if (chi == 0)
    ERR.Pointer(cname,fname,"chi") ;

//------------------------------------------------------------------
// allocate space for two CANONICAL fermion fields.
//------------------------------------------------------------------
  int f_size = FsiteSize() * GJP.VolNodeSites() ;

  char *str_v1 = "v1" ;
  Vector *v1 = (Vector *)smalloc(f_size*sizeof(Float)) ;
  if (v1 == 0)
    ERR.Pointer(cname, fname, str_v1) ;
  VRB.Smalloc(cname, fname, str_v1, v1, f_size*sizeof(Float)) ;

  char *str_v2 = "v2" ;
  Vector *v2 = (Vector *)smalloc(f_size*sizeof(Float)) ;
  if (v2 == 0)
    ERR.Pointer(cname, fname, str_v2) ;
  VRB.Smalloc(cname, fname, str_v2, v2, f_size*sizeof(Float)) ;

//------------------------------------------------------------------
// allocate space for two CANONICAL fermion field on a site.
//------------------------------------------------------------------

  char *str_site_v1 = "site_v1";
  Float *site_v1 = (Float *)smalloc(FsiteSize()*sizeof(Float));
  if (site_v1 == 0)
    ERR.Pointer(cname, fname, str_site_v1) ;
  VRB.Smalloc(cname, fname, str_site_v1, site_v1,
    FsiteSize()*sizeof(Float)) ;

  char *str_site_v2 = "site_v2";
  Float *site_v2 = (Float *)smalloc(FsiteSize()*sizeof(Float));
  if (site_v2 == 0)
    ERR.Pointer(cname, fname, str_site_v2) ;
  VRB.Smalloc(cname, fname, str_site_v2, site_v2,
    FsiteSize()*sizeof(Float)) ;

  {
    CgArg cg_arg ;
    cg_arg.mass = mass ;

    DiracOpWilson wilson(*this, v1, v2, &cg_arg, CNV_FRM_YES) ;
    wilson.CalcHmdForceVecs(chi) ;
  }

  int x, y, z, t, lx, ly, lz, lt ;

  lx = GJP.XnodeSites() ;
  ly = GJP.YnodeSites() ;
  lz = GJP.ZnodeSites() ;
  lt = GJP.TnodeSites() ;

//------------------------------------------------------------------
// start by summing first over direction (mu) and then over site
// to allow SCU transfers to happen face-by-face in the outermost
// loop.
//------------------------------------------------------------------

  int mu ;

  Matrix tmp, f ;

  Float L1 = 0.0;
  Float L2 = 0.0;
  Float Linf = 0.0;

  for (mu=0; mu<4; mu++) {
    for (t=0; t<lt; t++)
    for (z=0; z<lz; z++)
    for (y=0; y<ly; y++)
    for (x=0; x<lx; x++) {
      int gauge_offset = x+lx*(y+ly*(z+lz*t)) ;
      int vec_offset = FsiteSize()*gauge_offset ;
      gauge_offset = mu+4*gauge_offset ;

      Float *v1_plus_mu ;
      Float *v2_plus_mu ;
      int vec_plus_mu_offset = FsiteSize() ;

      Float coeff = -2.0 * dt ;

      switch (mu) {
        case 0 :
          vec_plus_mu_offset *= (x+1)%lx+lx*(y+ly*(z+lz*t)) ;
          if ((x+1) == lx) {
            getPlusData( (IFloat *)site_v1,
                         (IFloat *)v1+vec_plus_mu_offset, FsiteSize(), mu) ;
            getPlusData( (IFloat *)site_v2,
                         (IFloat *)v2+vec_plus_mu_offset, FsiteSize(), mu) ;
            v1_plus_mu = site_v1 ;                        
            v2_plus_mu = site_v2 ;                        
            if (GJP.XnodeBc()==BND_CND_APRD) coeff = -coeff ;
          } else {
            v1_plus_mu = (Float *)v1+vec_plus_mu_offset ;
            v2_plus_mu = (Float *)v2+vec_plus_mu_offset ;
          }
          break ;
        case 1 :
          vec_plus_mu_offset *= x+lx*((y+1)%ly+ly*(z+lz*t)) ;
          if ((y+1) == ly) {
            getPlusData( (IFloat *)site_v1,
                         (IFloat *)v1+vec_plus_mu_offset, FsiteSize(), mu) ;
            getPlusData( (IFloat *)site_v2,
                         (IFloat *)v2+vec_plus_mu_offset, FsiteSize(), mu) ;
            v1_plus_mu = site_v1 ;                        
            v2_plus_mu = site_v2 ;                        
            if (GJP.YnodeBc()==BND_CND_APRD) coeff = -coeff ;
          } else {
            v1_plus_mu = (Float *)v1+vec_plus_mu_offset ;
            v2_plus_mu = (Float *)v2+vec_plus_mu_offset ;
          }
          break ;
        case 2 :
          vec_plus_mu_offset *= x+lx*(y+ly*((z+1)%lz+lz*t)) ;
          if ((z+1) == lz) {
            getPlusData( (IFloat *)site_v1,
                         (IFloat *)v1+vec_plus_mu_offset, FsiteSize(), mu) ;
            getPlusData( (IFloat *)site_v2,
                         (IFloat *)v2+vec_plus_mu_offset, FsiteSize(), mu) ;
            v1_plus_mu = site_v1 ;
            v2_plus_mu = site_v2 ;
            if (GJP.ZnodeBc()==BND_CND_APRD) coeff = -coeff ;
          } else {
            v1_plus_mu = (Float *)v1+vec_plus_mu_offset ;
            v2_plus_mu = (Float *)v2+vec_plus_mu_offset ;
          }
          break ;
        case 3 :
          vec_plus_mu_offset *= x+lx*(y+ly*(z+lz*((t+1)%lt))) ;
          if ((t+1) == lt) {
            getPlusData( (IFloat *)site_v1,
                         (IFloat *)v1+vec_plus_mu_offset, FsiteSize(), mu) ;
            getPlusData( (IFloat *)site_v2,
                         (IFloat *)v2+vec_plus_mu_offset, FsiteSize(), mu) ;
            v1_plus_mu = site_v1 ;
            v2_plus_mu = site_v2 ;
            if (GJP.TnodeBc()==BND_CND_APRD) coeff = -coeff ;
          } else {
            v1_plus_mu = (Float *)v1+vec_plus_mu_offset ;
            v2_plus_mu = (Float *)v2+vec_plus_mu_offset ;
          }
      } // end switch mu

      sproj_tr[mu](   (IFloat *)&tmp,
                      (IFloat *)v1_plus_mu,
                      (IFloat *)v2+vec_offset, 1, 0, 0);

      sproj_tr[mu+4]( (IFloat *)&f,
                      (IFloat *)v2_plus_mu,
                      (IFloat *)v1+vec_offset, 1, 0, 0);

      tmp += f ;

      f.DotMEqual(*(gauge+gauge_offset), tmp) ;

      tmp.Dagger(f) ;

      f.TrLessAntiHermMatrix(tmp) ;

      f *= coeff ;

      *(mom+gauge_offset) += f ;
      Float norm = f.norm();
      Float tmp = sqrt(norm);
      L1 += tmp;
      L2 += norm;
      Linf = (tmp>Linf ? tmp : Linf);
    }
  }

//------------------------------------------------------------------
// deallocate space for two CANONICAL fermion fields on a site.
//------------------------------------------------------------------
  VRB.Sfree(cname, fname, str_site_v2, site_v2) ;
  sfree(site_v2) ;

  VRB.Sfree(cname, fname, str_site_v1, site_v1) ;
  sfree(site_v1) ;

//------------------------------------------------------------------
// deallocate space for two CANONICAL fermion fields.
//------------------------------------------------------------------
  VRB.Sfree(cname, fname, str_v2, v2) ;
  sfree(v2) ;

  VRB.Sfree(cname, fname, str_v1, v1) ;
  sfree(v1) ;

  glb_sum(&L1);
  glb_sum(&L2);
  glb_max(&Linf);

  L1 /= 4.0*GJP.VolSites();
  L2 /= 4.0*GJP.VolSites();

  return ForceArg(L1, sqrt(L2), Linf);
}
Пример #7
0
// CJ: change start
//------------------------------------------------------------------
// EvolveMomFforce(Matrix *mom, Vector *chi, Float mass, 
//                 Float dt):
// It evolves the canonical momentum mom by dt
// using the fermion force.
//------------------------------------------------------------------
ForceArg FdwfBase::EvolveMomFforce(Matrix *mom, Vector *chi, 
			   Float mass, Float dt){
  char *fname = "EvolveMomFforce(M*,V*,F,F,F)";
  VRB.Func(cname,fname);
  Matrix *gauge = GaugeField() ;

  if (Colors() != 3)
    ERR.General(cname,fname,"Wrong nbr of colors.") ;
 
  if (SpinComponents() != 4)
    ERR.General(cname,fname,"Wrong nbr of spin comp.") ;
 
  if (mom == 0)
    ERR.Pointer(cname,fname,"mom") ;
 
  if (chi == 0)
    ERR.Pointer(cname,fname,"chi") ;
 
  //----------------------------------------------------------------
  // allocate space for two CANONICAL fermion fields
  //----------------------------------------------------------------

  int f_size = FsiteSize() * GJP.VolNodeSites() ;
  int f_site_size_4d = 2 * Colors() * SpinComponents();
  int f_size_4d = f_site_size_4d * GJP.VolNodeSites() ;
 
  char *str_v1 = "v1" ;
  Vector *v1 = (Vector *)fmalloc(cname,fname,str_v1,f_size*sizeof(Float));
//  if (v1 == 0) ERR.Pointer(cname, fname, str_v1) ;
//  VRB.Smalloc(cname, fname, str_v1, v1, f_size*sizeof(Float)) ;

  char *str_v2 = "v2" ;
  Vector *v2 = (Vector *)fmalloc(cname,fname,str_v2,f_size*sizeof(Float)) ;
//  if (v2 == 0) ERR.Pointer(cname, fname, str_v2) ;
//  VRB.Smalloc(cname, fname, str_v2, v2, f_size*sizeof(Float)) ;

//  LatMatrix MomDiff(QFAST,4);
//  Matrix *mom_diff = MomDiff.Mat();

  Float L1 = 0.0;
  Float L2 = 0.0;
  Float Linf = 0.0;

  //----------------------------------------------------------------
  // Calculate v1, v2. Both v1, v2 must be in CANONICAL order after
  // the calculation.
  //----------------------------------------------------------------  

  VRB.Clock(cname, fname, "Before calc force vecs.\n") ;

#ifdef PROFILE
  Float time = -dclock();
  ForceFlops=0;
#endif
  {
    CgArg cg_arg ;
    cg_arg.mass = mass ;

    DiracOpDwf dwf(*this, v1, v2, &cg_arg, CNV_FRM_NO) ;
    dwf.CalcHmdForceVecs(chi) ;
    Fconvert(v1,CANONICAL,WILSON);
    Fconvert(v2,CANONICAL,WILSON);
  }
#ifdef PROFILE
  time += dclock();
  print_flops(cname,fname,ForceFlops,time);
#endif

  int mu, x, y, z, t, s, lx, ly, lz, lt, ls ;
  int size[5],surf[4],blklen[4],stride[4],numblk[4];
 
  size[0] = lx = GJP.XnodeSites() ;
  size[1] = ly = GJP.YnodeSites() ;
  size[2] = lz = GJP.ZnodeSites() ;
  size[3] = lt = GJP.TnodeSites() ;
  size[4] = ls = GJP.SnodeSites() ;

  blklen[0] = sizeof(Float)*FsiteSize()/size[4];
  numblk[0] = GJP.VolNodeSites()/size[0]*size[4];
  stride[0] = blklen[0] * (size[0]-1);
  for (int i =1;i<4;i++){
    blklen[i] = blklen[i-1] * size[i-1];
	numblk[i] = numblk[i-1] / size[i];
    stride[i] = blklen[i] * (size[i]-1);
  }
  for (int i =0;i<4;i++){
    surf[i] = GJP.VolNodeSites()/size[i];
  }

  //----------------------------------------------------------------
  // allocate buffer space for two fermion fields that are assoc
  // with only one 4-D site.
  //----------------------------------------------------------------

  unsigned long flag = QFAST;
  if (ls<4) flag|= QNONCACHE;
  char *str_site_v1 = "site_v1" ;
  char *str_site_v2 = "site_v2" ;

  Float *v1_buf[4],*v2_buf[4];
  int pos[4];
  Float *v1_buf_pos[4];
  Float *v2_buf_pos[4];
  for(int i =0;i<4;i++) {
    v1_buf[i]=(Float *)fmalloc(cname,fname,"v1_buf",surf[i]*FsiteSize()*sizeof(Float)) ;
    v2_buf[i]=(Float *)fmalloc(cname,fname,"v2_buf",surf[i]*FsiteSize()*sizeof(Float)) ;
  }

//  Matrix tmp_mat1, tmp_mat2 ;
  Matrix *tmp_mat1,*tmp_mat2;
  tmp_mat1 = (Matrix *)fmalloc(cname,fname,"tmp_mat1",sizeof(Matrix)*2);
  tmp_mat2 = tmp_mat1+1;

  SCUDirArgIR Send[4];
  SCUDirArgIR Recv[4];
  SCUDMAInst *dma[2];
  for(int i = 0;i<2;i++) dma[i] = new SCUDMAInst;
  void *addr[2];
  int f_bytes = sizeof(Float)*f_site_size_4d;
  int st_bytes = sizeof(Float)*f_size_4d - f_bytes;
  for (mu=0; mu<4; mu++){
    if ( GJP.Nodes(mu) >1){
      dma[0]->Init(v1_buf[mu],surf[mu]*ls*f_bytes);
      dma[1]->Init(v2_buf[mu],surf[mu]*ls*f_bytes);
      Recv[mu].Init(gjp_scu_dir[2*mu],SCU_REC,dma,2);
      dma[0]->Init(v1,blklen[mu],numblk[mu],stride[mu]);
      dma[1]->Init(v2,blklen[mu],numblk[mu],stride[mu]);
      Send[mu].Init(gjp_scu_dir[2*mu+1],SCU_SEND,dma,2);
    }
  }
  for(int i = 0;i<2;i++) delete dma[i];

  VRB.Clock(cname, fname, "Before loop over links.\n") ;

#ifdef PROFILE
  time = -dclock();
  ForceFlops=0;
#endif

  sys_cacheflush(0);
  for (mu=0; mu<4; mu++){
    if ( GJP.Nodes(mu) >1){
      Recv[mu].StartTrans();
      Send[mu].StartTrans();
    }
  }
  for (mu=0; mu<4; mu++){
    for (pos[3]=0; pos[3]<size[3]; pos[3]++){
    for (pos[2]=0; pos[2]<size[2]; pos[2]++){
    for (pos[1]=0; pos[1]<size[1]; pos[1]++){
    for (pos[0]=0; pos[0]<size[0]; pos[0]++){
      int gauge_offset = offset(size,pos);
      int vec_offset = f_site_size_4d*gauge_offset ;
      gauge_offset = mu+4*gauge_offset ;

//      printf("%d %d %d %d %d\n",mu,pos[0],pos[1],pos[2],pos[3]);
      Float *v1_plus_mu ;
      Float *v2_plus_mu ;
      int vec_plus_mu_stride ;
      int vec_plus_mu_offset = f_site_size_4d ;

      Float coeff = -2.0 * dt ;

          vec_plus_mu_offset *= offset(size,pos,mu);
      if ((GJP.Nodes(mu)>1)&&((pos[mu]+1) == size[mu]) ) {
      } else {
            v1_plus_mu = (Float *)v1+vec_plus_mu_offset ;
            v2_plus_mu = (Float *)v2+vec_plus_mu_offset ;
            vec_plus_mu_stride = f_size_4d - f_site_size_4d ;
            if ((pos[mu]+1) == size[mu])
            if (GJP.NodeBc(mu)==BND_CND_APRD) coeff = -coeff ;

//     Float time2 = -dclock();
      sproj_tr[mu]( (IFloat *)tmp_mat1,
                    (IFloat *)v1_plus_mu,
                    (IFloat *)v2+vec_offset,
                    ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ;

      sproj_tr[mu+4]( (IFloat *)tmp_mat2,
                      (IFloat *)v2_plus_mu,
                      (IFloat *)v1+vec_offset,
                      ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ;
 //    time2 += dclock();
 //    print_flops("","sproj",2*9*16*ls,time2);

      *tmp_mat1 += *tmp_mat2 ;

      // If GJP.Snodes > 1 sum up contributions from all s nodes
      if(GJP.Snodes() > 1) {
	  glb_sum_multi_dir((Float *)tmp_mat1, 4, sizeof(Matrix)/sizeof(IFloat) ) ;
      }

      tmp_mat2->DotMEqual(*(gauge+gauge_offset), *tmp_mat1) ;

      tmp_mat1->Dagger(*tmp_mat2) ;

      tmp_mat2->TrLessAntiHermMatrix(*tmp_mat1) ;

      *tmp_mat2 *= coeff ;

      *(mom+gauge_offset) += *tmp_mat2 ;

      Float norm = tmp_mat2->norm();
      Float tmp = sqrt(norm);
      L1 += tmp;
      L2 += norm;
      Linf = (tmp>Linf ? tmp : Linf);
    }

    } } } } // end for x,y,z,t
  } // end for mu
  for (mu=0; mu<4; mu++){
    if ( GJP.Nodes(mu) >1){
      Recv[mu].TransComplete();
      Send[mu].TransComplete();
    }
  }
 
//------------------------------------------------------------------
// start by summing first over direction (mu) and then over site
// to allow SCU transfers to happen face-by-face in the outermost
// loop.
//------------------------------------------------------------------

  for(int i = 0;i<4;i++){
    v1_buf_pos[i] = v1_buf[i];
    v2_buf_pos[i] = v2_buf[i];
  }

  for (mu=0; mu<4; mu++){
    for (pos[3]=0; pos[3]<size[3]; pos[3]++){
    for (pos[2]=0; pos[2]<size[2]; pos[2]++){
    for (pos[1]=0; pos[1]<size[1]; pos[1]++){
    for (pos[0]=0; pos[0]<size[0]; pos[0]++){
      int gauge_offset = offset(size,pos);
      int vec_offset = f_site_size_4d*gauge_offset ;
      gauge_offset = mu+4*gauge_offset ;

      Float *v1_plus_mu ;
      Float *v2_plus_mu ;
      int vec_plus_mu_stride ;
      int vec_plus_mu_offset = f_site_size_4d ;

      Float coeff = -2.0 * dt ;

//      printf("%d %d %d %d %d\n",mu,pos[0],pos[1],pos[2],pos[3]);
          vec_plus_mu_offset *= offset(size,pos,mu);
      if ((GJP.Nodes(mu)>1)&&((pos[mu]+1) == size[mu]) ) {
            v1_plus_mu = v1_buf_pos[mu] ;
            v2_plus_mu = v2_buf_pos[mu] ;
 			v1_buf_pos[mu] += f_site_size_4d;
 			v2_buf_pos[mu] += f_site_size_4d;
            vec_plus_mu_stride = (surf[mu] -1)*f_site_size_4d ;
            if (GJP.NodeBc(mu)==BND_CND_APRD) coeff = -coeff ;

//     Float time2 = -dclock();
      sproj_tr[mu]( (IFloat *)tmp_mat1,
                    (IFloat *)v1_plus_mu,
                    (IFloat *)v2+vec_offset,
                    ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ;

      sproj_tr[mu+4]( (IFloat *)tmp_mat2,
                      (IFloat *)v2_plus_mu,
                      (IFloat *)v1+vec_offset,
                      ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ;
 //    time2 += dclock();
 //    print_flops("","sproj",2*9*16*ls,time2);

      *tmp_mat1 += *tmp_mat2 ;

      // If GJP.Snodes > 1 sum up contributions from all s nodes
      if(GJP.Snodes() >1 ) {
	  glb_sum_multi_dir((Float *)tmp_mat1, 4, sizeof(Matrix)/sizeof(IFloat) ) ;
      }

      tmp_mat2->DotMEqual(*(gauge+gauge_offset), *tmp_mat1) ;

      tmp_mat1->Dagger(*tmp_mat2) ;

      tmp_mat2->TrLessAntiHermMatrix(*tmp_mat1) ;

      *tmp_mat2 *= coeff ;

      *(mom+gauge_offset) += *tmp_mat2 ;
      Float norm = tmp_mat2->norm();
      Float tmp = sqrt(norm);
      L1 += tmp;
      L2 += norm;
      Linf = (tmp>Linf ? tmp : Linf);
    }

    } } } } // end for x,y,z,t
  } // end for mu

  ForceFlops += (2*9*16*ls + 18+ 198+36+24)*lx*ly*lz*lt*4;
#ifdef PROFILE
  time += dclock();
  print_flops(cname,fname,ForceFlops,time);
#endif
 
//------------------------------------------------------------------
// deallocate smalloc'd space
//------------------------------------------------------------------

  for(int i =0;i<4;i++) {
    ffree(v1_buf[i],cname,fname,"v1_buf");
    ffree(v2_buf[i],cname,fname,"v2_buf");
  }
 
  VRB.Sfree(cname, fname, str_v2, v2) ;
  ffree(v2) ;
 
  VRB.Sfree(cname, fname, str_v1, v1) ;
  ffree(v1) ;

  ffree(tmp_mat1);
 
  glb_sum(&L1);
  glb_sum(&L2);
  glb_max(&Linf);

  L1 /= 4.0*GJP.VolSites();
  L2 /= 4.0*GJP.VolSites();

  return ForceArg(L1, sqrt(L2), Linf);
}