コード例 #1
0
ファイル: vector_util.C プロジェクト: DeanHowarth/QUDA-CPS
CPS_START_NAMESPACE
/*!\file
  \brief  Definitions of functions that perform operations on complex matrices
  and vectors.

  $Id: vector_util.C,v 1.10 2013-04-19 20:25:52 chulwoo Exp $
*/
//--------------------------------------------------------------------
//  CVS keywords
//
//  $Author: chulwoo $
//  $Date: 2013-04-19 20:25:52 $
//  $Header: /home/chulwoo/CPS/repo/CVS/cps_only/cps_pp/src/util/vector/comsrc/vector_util.C,v 1.10 2013-04-19 20:25:52 chulwoo Exp $
//  $Id: vector_util.C,v 1.10 2013-04-19 20:25:52 chulwoo Exp $
//  $Name: not supported by cvs2svn $
//  $Locker:  $
//  $Revision: 1.10 $
//  $Source: /home/chulwoo/CPS/repo/CVS/cps_only/cps_pp/src/util/vector/comsrc/vector_util.C,v $
//  $State: Exp $
//
//--------------------------------------------------------------------
/*------------------------------------------------------------------*/
/*
   For these functions there exists optimized assembly 
   code.
*/
/*------------------------------------------------------------------*/

CPS_END_NAMESPACE
#include <string.h>		/* memcpy */
#include <util/vector.h>
#include <util/time_cps.h>
//#include<omp.h>
CPS_START_NAMESPACE


/*!
  \param b The vector to be copied to
  \param a The vector to be copied from.
  \param len The number of bytes to be copied.

   The arrays \a c and \a b must not alias each other.
*/
//---------------------------------------------------------------//
void moveMem(void *b, const void *a, int len) 
{
#undef PROFILE
#ifdef PROFILE
    double time  = -dclock();
#endif
    memcpy(b, a, len); 
#ifdef PROFILE
    time += dclock();
    print_flops("","moveMem",len,time);
#endif
}
コード例 #2
0
ファイル: pt_vec.C プロジェクト: DeanHowarth/QUDA-CPS
//Parallel transport of a vector through one hop
void PT::vec(int n, IFloat **vout, IFloat **vin, const int *dir){
  int i;
  static int call_num=0;
  SCUDirArgIR *SCUarg_p[2*n];
  call_num++;
  //for(int s = 0; s < GJP.VolNodeSites(); s++)
  //  {
  //    for(int t = 0; t < 4; t++)
  //	{
  //	  printf("site = %d, direction = %d\n",s,t);
  //	  for(int u = 0; u < 9; u++)
  //	    printf("%e %e\n",*(gauge_field_addr+4*GAUGE_LEN*s + GAUGE_LEN*t + 2*u),*(gauge_field_addr+4*GAUGE_LEN*s + GAUGE_LEN*t + 2*u+1));
  //	}
  //  }

#ifdef PROFILE
  Float dtime  = - dclock();
#endif
  int wire[n];
  SCUDirArgMulti SCUmulti;

  char *fname="pt_1vec";
//  VRB.Func("",fname);
	
  int non_local_dir=0;
  for(i=0;i<n;i++) wire[i] = dir[i]; // from (x,y,z,t) to (t,x,y,z)
//  for(i=0;i<n;i++) printf("wire[%d]=%d\n",i,dir[i]);
  for(i=0;i<n;i++)
  if (!local[wire[i]/2]){
    IFloat * addr = (vin[i]+VECT_LEN*offset[wire[i]]);
    SCUarg_p[2*non_local_dir] = SCUarg[0][2*wire[i]];
    SCUarg_p[2*non_local_dir+1] = SCUarg[0][2*wire[i]+1];
    SCUarg_p[2*non_local_dir+1]->Addr((void *)addr);
    non_local_dir++;
  }
  if(non_local_dir){
    SCUmulti.Init(SCUarg_p,non_local_dir*2);
    SCUmulti.SlowStartTrans();
  }
	
  for(i=0;i<n;i++) 
    partrans_cmv_agg(local_chi[wire[i]],(long)uc_l[wire[i]], (long)vin[i],(long)vout[i]);
	
  if(non_local_dir){ SCUmulti.TransComplete(); }

  for(i=0;i<n;i++) 
    partrans_cmv_agg(non_local_chi[wire[i]],(long)uc_nl[wire[i]], (long)rcv_buf[wire[i]],(long)vout[i]);

#ifdef PROFILE
  dtime +=dclock();
  print_flops("",fname,66*n*vol,dtime);
#endif
  Flops +=66*n*vol;
}
コード例 #3
0
ファイル: vector_util.C プロジェクト: DeanHowarth/QUDA-CPS
void moveVec(Float *b, const Float *a, int len) {
#undef PROFILE
#ifdef PROFILE
    double time  = -dclock();
#endif
//    for(int i =0;i<len*6;i++) *b++ = *a++; 
    memcpy(b, a, len*sizeof(Vector)); 
#ifdef PROFILE
    time += dclock();
    print_flops("","moveVec",len*sizeof(Float),time);
#endif
}
コード例 #4
0
ForceArg GimprRect::EvolveMomGforce(Matrix *mom, Float dt){
  char *fname = "EvolveMomGforce(M*,F)";
  VRB.Func(cname,fname);

  Float L1=0.0;
  Float L2=0.0;
  Float Linf=0.0;

#ifdef PROFILE
  Float time = -dclock();
  ForceFlops = 0;
#endif
  
  setCbufCntrlReg(4, CBUF_MODE4);

  int x[4];
  
  for(x[0] = 0; x[0] < GJP.XnodeSites(); ++x[0])
  for(x[1] = 0; x[1] < GJP.YnodeSites(); ++x[1])
  for(x[2] = 0; x[2] < GJP.ZnodeSites(); ++x[2])
  for(x[3] = 0; x[3] < GJP.TnodeSites(); ++x[3]) {

    int uoff = GsiteOffset(x);

    for (int mu = 0; mu < 4; ++mu) {
      GforceSite(*mp0, x, mu);

      IFloat *ihp = (IFloat *)(mom+uoff+mu);
      IFloat *dotp = (IFloat *)mp0;
      fTimesV1PlusV2(ihp, dt, dotp, ihp, 18);
      Float norm = ((Matrix*)dotp)->norm();
      Float tmp = sqrt(norm);
      L1 += tmp;
      L2 += norm;
      Linf = (tmp>Linf ? tmp : Linf);
   }
  }
  ForceFlops +=GJP.VolNodeSites()*4*18*2;
#ifdef PROFILE
  time += dclock();
  print_flops(cname,fname,ForceFlops,time);
#endif

  glb_sum(&L1);
  glb_sum(&L2);
  glb_max(&Linf);

  L1 /= 4.0*GJP.VolSites();
  L2 /= 4.0*GJP.VolSites();

  VRB.FuncEnd(cname,fname);
  return ForceArg(dt*L1, dt*sqrt(L2), dt*Linf);
}
コード例 #5
0
ファイル: alg_momentum.C プロジェクト: DeanHowarth/QUDA-CPS
//!< Calculate gauge contribution to the Hamiltonian
Float AlgMomentum::energy() {
  Float dtime = -dclock();

  const char *fname = "energy()";
  Lattice &lat = LatticeFactory::Create(F_CLASS_NONE, G_CLASS_NONE);
  Float h = lat.MomHamiltonNode(mom);
  LatticeFactory::Destroy();

  dtime += dclock();
  print_flops(cname, fname, 0, dtime);

  return h;
}
コード例 #6
0
ファイル: alg_momentum.C プロジェクト: DeanHowarth/QUDA-CPS
//!< evolve method evolves the gauge field due to the momentum
void AlgMomentum::evolve(Float dt, int steps) 
{
  const char *fname = "evolve()";
  Float dtime = -dclock();

  Lattice &lat = LatticeFactory::Create(F_CLASS_NONE, G_CLASS_NONE);
  for (int i=0; i<steps; i++) lat.EvolveGfield(mom, dt);
  lat.MdTimeInc(dt*steps);
  VRB.Flow(cname,fname,"%s%f\n", md_time_str, IFloat(lat.MdTime()));
  LatticeFactory::Destroy();

  dtime += dclock();
  print_flops(cname, fname, 1968. * 4. * GJP.VolNodeSites() * steps, dtime);
}
コード例 #7
0
ファイル: vector_util.C プロジェクト: DeanHowarth/QUDA-CPS
void moveFloat(Float *b, const Float *a, int len) {
#undef PROFILE
#ifdef PROFILE
    double time  = -dclock();
#endif

#ifdef USE_OMP
#pragma omp parallel for
    for(int i =0;i<len;i++) b[i] = a[i];
#else
    memcpy(b, a, len*sizeof(Float)); 
#endif
#ifdef PROFILE
    time += dclock();
    print_flops("","moveFloat",len*sizeof(Float),time);
#endif
}
コード例 #8
0
ファイル: alg_momentum.C プロジェクト: DeanHowarth/QUDA-CPS
//!< Heat Bath for the conjugate momentum
void AlgMomentum::heatbath() {

  const char *fname = "heatbath()";
  Float dtime = -dclock();

  Lattice &lat = LatticeFactory::Create(F_CLASS_NONE, G_CLASS_NONE);
  lat.RandGaussAntiHermMatrix(mom, 1.0);

  //!< reset MD time in Lattice (a momentum refresh means a new trajectory)
  lat.MdTime(0.0);
  VRB.Flow(cname,fname,"%s%f\n", md_time_str, IFloat(lat.MdTime()));
      
  LatticeFactory::Destroy();
  
  dtime += dclock();
  print_flops(cname, fname, 0, dtime);
}
コード例 #9
0
// The outer product of v and w, multiplied by the parity factor and
// coefficient, and added to the force f.
// N.B. The force term is multiplied by -1 on ODD parity sites. This the
// the MILC convention and is here to test against the MILC code.
// This should eventually be changed to EVEN for the CPS.
void Fasqtad::force_product_sum(const Vector *v, const Vector *w,
				    IFloat coeff, Matrix *f){

  static int vol  = 0;
  char *fname = "force_product_sum(*V,*V,F,*M)";
  if (vol==0)
    vol = GJP.XnodeSites() * GJP.YnodeSites() * GJP.ZnodeSites() * GJP.TnodeSites() ;
  ForceFlops +=78*vol;
  unsigned long v2 = (unsigned long)v;
  if( qalloc_is_fast((Vector *)v) &&
      qalloc_is_fast((Vector *)w) &&
      qalloc_is_fast((Matrix *)f) )
    v2 = v2 - 0xb0000000 + 0x9c000000;
  
#ifdef PROFILE
  Float dtime = -dclock();
#endif
  IFloat coeff2 = 2.0*coeff;
  Force_cross2dag((Vector *)v2, w, f, vol/2, &coeff2);
#ifdef PROFILE
  dtime += dclock();
  print_flops(cname,fname,78*vol,dtime);
#endif
#if 0
  {IFloat *tmp = (IFloat *)f;
   printf("result[0]=");
   for(int i = 0;i<18;i++){
	printf(" %0.8e",*(tmp++));
	if(i%6==5) printf("\n");
   }
   tmp = (IFloat *)&(f[vol-1]);
   printf("result[%d]=",vol-1);
   for(int i = 0;i<18;i++){
	printf(" %0.8e",*(tmp++));
	if(i%6==5) printf("\n");
   }
  }
  exit(54);
#endif
}
コード例 #10
0
int main(int argc, char **argv)
{
	int n; //problem size
	double *a, *b, *c;

	int mem_size;

	int i, j, k;
	int ii, jj, kk;
	 
	double para_time_result;


	int block_size = 16;
	int num_thread = 0;


	n = 500; /* default problem size */
	double flops;

	/* Default values if nothing is given by command prompt */
	if(argc > 1){
		n = atoi(argv[1]);
	}
	if(argc > 2){
		block_size = atoi(argv[2]);
	}



	/* Dynamic memory allocation */
	mem_size = n * n * sizeof(double);
	a = (double*)malloc(mem_size);
	b = (double*)malloc(mem_size);
	c = (double*)malloc(mem_size);
	if(0 == a || 0 == b || 0 == c){
		printf("memory allocation failed");
		return 0;
	}

	/* initialisation */
	for (i = 0; i < n; i++){
		for (j = 0; j < n; j++){
			*(a + i * n + j) = (double)i + (double)j;
			*(b + i * n + j) = (double)(n - i) + (double)(n - j);
		}
	}


	memset(c, 0, mem_size);


	/* Parallelizing matrix multiplication with OpenMP where each cache-block is handled by only one thread starts here*/
	time_marker_t time = get_time();
	#pragma omp parallel default(none) shared(block_size, n, a, b, c) private(i, j, k, ii, jj, kk) reduction(+:num_thread)
	{
		num_thread += 1;

		#pragma omp for schedule(dynamic)
		for(i = 0; i < n; i += block_size){
			for(j = 0; j < n; j += block_size){
				for(k = 0; k < n; k += block_size){

					for(ii = i; ii < min(i + block_size, n); ii++) {
						for(jj = j; jj < min(j + block_size, n); jj++) {
							for(kk = k; kk < min(k + block_size, n); kk++) {
								c[ii * n + jj] += a[ii * n + kk] * b[kk * n + jj];
							}
						}
					}
				}
			}
		}


		#pragma omp barrier
  	}

	flops = 2.0 * n * n * n;

	para_time_result = print_flops(flops, time);
	printf("\nOpenMP: Problem size = %d, cache block size = %d, num_threads = %d, time=%g\n",
                                         n, block_size, num_thread, para_time_result);
	printf("****************************************************************************************\n");

	/* matrix multiplication with OpenMP ends here */


	return(0);
}
コード例 #11
0
ファイル: pt_mat.C プロジェクト: DeanHowarth/QUDA-CPS
void PT::mat_cb_norm(int n, IFloat **mout, IFloat **min, const int *dir, int
parity, IFloat * gauge)
{
  //List of the different directions
  int wire[MAX_DIR];
  int i;
//  printf("PT::mat_cb_norm\n");

  QMP_msgmem_t *msg_mem_p = (QMP_msgmem_t *)Alloc("","vec_cb_norm", "msg_mem_p", 2*non_local_dirs*sizeof(QMP_msgmem_t));
  QMP_msghandle_t* msg_handle_p = (QMP_msghandle_t *)Alloc("","vec_cb_norm", "msg_handle_p", 2*non_local_dirs*sizeof(QMP_msghandle_t));
  QMP_msghandle_t multiple;
  static int call_num = 0;
  int vlen = VECT_LEN;
  int vlen2 = VECT_LEN;

  call_num++;
  
  //Name our function
  char *fname="pt_mat_cb()";
  //  VRB.Func("",fname);
  
  //Set the transfer directions
  //If wire[i] is even, then we have communication in the negative direction
  //If wire[i] is odd, then we have communication in the positive direction
  for(i=0;i<n;i++)
    wire[i]=dir[i];

#ifdef PROFILE
  Float dtime  = - dclock();
#endif
  int non_local_dir=0;

//#pragma omp parallel default(shared)
{

  //If wire[i] is odd, then we have parallel transport in the
  //positive direction.  In this case, multiplication by the link matrix is
  //done before the field is transferred over to the adjacent node
  //
  //If we have transfer in the negative T direction (wire[i] = 6), then
  //we have to copy the appropriate fields to a send buffer
//#pragma omp for
  for(i=0;i<n;i++)
    {
      if(!local[wire[i]/2])
      {
	if(wire[i]%2)
	  {
	    if(conjugated)
	      pt_cmm_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb_pre[parity][wire[i]/2],(long)min[i],(long)snd_buf_cb[wire[i]/2],(long)gauge);
	    else
	      pt_cmm_dag_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb_pre[parity][wire[i]/2],(long)min[i],(long)snd_buf_cb[wire[i]/2],(long)gauge);
	  }
	else if((wire[i] == 6))
	  {
	    for(int j = 0; j < non_local_chi_cb[6];j++)
	      memcpy(snd_buf_t_cb + j*GAUGE_LEN,min[i] + 3 * *(Toffset[parity]+j)*3,GAUGE_LEN*sizeof(IFloat));
	  }
      }
    }

//#pragma omp barrier
//#pragma omp master 
{
  for(i=0;i<n;i++)
    if(!local[wire[i]/2])
    {
      //Calculate the starting address for the data to be sent
      IFloat *addr = min[i] + GAUGE_LEN * offset_cb[wire[i]];

      msg_mem_p[2*non_local_dir] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat));
      
      //Initialize the msg_mem for sends
      if(wire[i]%2) 
	msg_mem_p[2*non_local_dir+1] = QMP_declare_msgmem((void *)snd_buf_cb[wire[i]/2], 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat));
      else if(wire[i] == 6)
	msg_mem_p[2*non_local_dir+1] = QMP_declare_msgmem((void *)snd_buf_t_cb, 3*non_local_chi_cb[wire[i]]*VECT_LEN*sizeof(IFloat));
      else
	msg_mem_p[2*non_local_dir+1] = QMP_declare_strided_msgmem((void *)addr, (size_t)(3*blklen_cb[wire[i]]), numblk_cb[wire[i]], (ptrdiff_t)(3*stride_cb[wire[i]]+3*blklen_cb[wire[i]]));
      
      msg_handle_p[2*non_local_dir] = QMP_declare_receive_relative(msg_mem_p[2*non_local_dir], wire[i]/2, 1-2*(wire[i]%2), 0);
      msg_handle_p[2*non_local_dir+1] = QMP_declare_send_relative(msg_mem_p[2*non_local_dir+1], wire[i]/2, 2*(wire[i]%2)-1, 0);

      non_local_dir++;

    }

  if(non_local_dir) {
    multiple = QMP_declare_multiple(msg_handle_p, 2*non_local_dir);
    QMP_start(multiple);
  }
} //#pragma omp master {

  //Do local calculations
//#pragma omp for
  for(i=0;i<n;i++)
    {
      if((wire[i]%2 && conjugated) || ((wire[i]%2 == 0) && (conjugated == 0)))
	pt_cmm_cpp(local_chi_cb[wire[i]],(long)uc_l_cb[parity][wire[i]],(long)min[i],(long)mout[i],(long)gauge);
      else
	pt_cmm_dag_cpp(local_chi_cb[wire[i]],(long)uc_l_cb[parity][wire[i]],(long)min[i],(long)mout[i],(long)gauge);
    }

//#pragma omp barrier
//#pragma omp master 
{
  if(non_local_dir) {
    QMP_status_t qmp_complete_status = QMP_wait(multiple);
    if (qmp_complete_status != QMP_SUCCESS)
          QMP_error("Send failed in vec_cb_norm: %s\n", QMP_error_string(qmp_complete_status));
    QMP_free_msghandle(multiple);
    for(int i = 0; i < 2*non_local_dir; i++)
      QMP_free_msgmem(msg_mem_p[i]);
    Free(msg_handle_p);
    Free(msg_mem_p);
  }
} //#pragma omp master {

  //If wire[i] is even, then we have transport in the negative direction
  //In this case, the vector field is multiplied by the SU(3) link matrix
  //after all communication is complete
  IFloat *fp0,*fp1;
//#pragma omp for
  for(i=0;i<n;i++)
    {
      if(!local[wire[i]/2])
      	{
	  if(!(wire[i]%2))
	    {
	      if(conjugated)
		pt_cmm_dag_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb[parity][wire[i]],(long)rcv_buf[wire[i]],(long)mout[i],(long)gauge);
	      else
		pt_cmm_cpp(non_local_chi_cb[wire[i]],(long)uc_nl_cb[parity][wire[i]],(long)rcv_buf[wire[i]],(long)mout[i],(long)gauge);
	    }
	  //Otherwise we have parallel transport in the positive direction.
	  //In this case, the received data has already been pre-multiplied
	  //All we need to do is to put the transported field in the correct place
	  else
	    {
	      //int destination, source;
	      //Place the data in the receive buffer into the result vector
	      for(int s=0;s<non_local_chi_cb[wire[i]];s++)
		{
		  //source = uc_nl_cb[parity][wire[i]][s].src;
		  fp0 = (IFloat *)((long)rcv_buf[wire[i]]+3*uc_nl_cb[parity][wire[i]][s].src);
		  //destination = uc_nl_cb[parity][wire[i]][s].dest;
		  fp1 = (IFloat *)(mout[i]+3*uc_nl_cb[parity][wire[i]][s].dest);
		  memcpy(fp1,fp0,GAUGE_LEN*sizeof(IFloat));
		}
	    }
	}
    }

} //#pragma omp parallel
#ifdef PROFILE
  dtime +=dclock();
  print_flops("",fname,99*vol*n,dtime);
#endif
//  ParTrans::PTflops +=99*n*vol;
}
コード例 #12
0
ファイル: pt_mat.C プロジェクト: DeanHowarth/QUDA-CPS
void PT::mat(int n, matrix **mout, matrix **min, const int *dir){
    
  int wire[MAX_DIR];
  int i;
  QMP_msgmem_t msg_mem_p[2*MAX_DIR];
  QMP_msghandle_t msg_handle_p[2*MAX_DIR];
  QMP_msghandle_t multiple;
  static double setup=0.,qmp=0.,localt=0.,nonlocal=0.;
  static int call_num = 0;

  call_num++;
//  char *fname="pt_mat()";
//  VRB.Func("",fname);
//  if (call_num%100==1) printf("PT:mat()\n");

  
  for(i=0;i<n;i++) wire[i] = dir[i]; 
#ifdef PROFILE
  Float dtime2  = - dclock();
#endif

  double dtime = -dclock();

  int non_local_dir=0;
  
  for(i=0;i<n;i++)
  if (!local[wire[i]/2]) {
    //Calculate the address for transfer in a particular direction
    Float * addr = ((Float *)min[i]+GAUGE_LEN*offset[wire[i]]);
    msg_mem_p[2*non_local_dir] = QMP_declare_msgmem((void *)rcv_buf[wire[i]], 3*non_local_chi[wire[i]]*VECT_LEN*sizeof(IFloat));
    msg_mem_p[2*non_local_dir+1] = QMP_declare_strided_msgmem((void *)addr, (size_t)(3*blklen[wire[i]]), numblk[wire[i]], (ptrdiff_t)(3*stride[wire[i]]+3*blklen[wire[i]]));
    
    msg_handle_p[2*non_local_dir] = QMP_declare_receive_relative(msg_mem_p[2*non_local_dir], wire[i]/2, 1-2*(wire[i]%2), 0);
    msg_handle_p[2*non_local_dir+1] = QMP_declare_send_relative(msg_mem_p[2*non_local_dir+1], wire[i]/2, 2*(wire[i]%2)-1, 0);

    non_local_dir++;
  }
  if (call_num==1 && !QMP_get_node_number())
	printf("non_local_dir=%d\n",non_local_dir);

  if(non_local_dir) {
    multiple = QMP_declare_multiple(msg_handle_p, 2*non_local_dir);
    QMP_start(multiple);
  }

  dtime += dclock();
  setup +=dtime;
  dtime = -dclock();
  int if_print = 0;
//  if ( (call_num%10000==1) && (!QMP_get_node_number()) ) if_print=1;

#define USE_TEST2
#ifdef USE_TEST2
//assume nt > n!
    static char *cname="mat()";
#pragma omp parallel default(shared)
{
  int iam,nt,ipoints,istart,offset;
  iam = omp_get_thread_num();
  nt = omp_get_num_threads();
  int nt_dir = nt/n;
  int n_t = iam/nt_dir;
  int i_t = iam%nt_dir;
  if (n_t >= n ){  n_t = n-1;
    i_t = iam - (n-1)*nt_dir;
    nt_dir = nt -(n-1)*nt_dir;
  }
  int w_t = wire[n_t];
  ipoints = (local_chi[w_t]/2)/nt_dir;
  offset = ipoints*i_t;
  if (i_t == (nt_dir-1)) ipoints = (local_chi[w_t]/2)-offset;
    if ( if_print )
      printf("thread %d of %d nt_dir n_t i_t ipoints offset= %d %d %d %d %d\n",iam,nt,nt_dir,n_t,i_t,ipoints,offset);
  //Interleaving of local computation of matrix multiplication
  partrans_cmm_agg((uc_l[w_t]+offset*2),min[n_t],mout[n_t],ipoints);
    if ( if_print )
      printf("thread %d of %d done\n",iam,nt);
}
#else
{
  //Interleaving of local computation of matrix multiplication
#pragma omp parallel for default(shared)
  for(i=0;i<n;i++){
    partrans_cmm_agg(uc_l[wire[i]],min[i],mout[i],local_chi[wire[i]]/2);
  }
}
#endif

  dtime += dclock();
  localt +=dtime;
  dtime = -dclock();
//#pragma omp barrier
//#pragma omp master 
{
  if(non_local_dir) {
    QMP_status_t qmp_complete_status = QMP_wait(multiple);
    if (qmp_complete_status != QMP_SUCCESS)
          QMP_error("Send failed in vec_cb_norm: %s\n", QMP_error_string(qmp_complete_status));
    QMP_free_msghandle(multiple);
    for(int i = 0; i < 2*non_local_dir; i++)
      QMP_free_msgmem(msg_mem_p[i]);
//    Free(msg_handle_p);
//    Free(msg_mem_p);
  }
} //#pragma omp master {
  dtime += dclock();
  qmp +=dtime;
  dtime = -dclock();

  //Do non-local computations
#ifdef USE_TEST2
//assume nt > n!
#pragma omp parallel default(shared)
{
  int iam,nt,ipoints,istart,offset;
  iam = omp_get_thread_num();
  nt = omp_get_num_threads();
  int nt_dir = nt/n;
  int n_t = iam/nt_dir;
  int i_t = iam%nt_dir;
  if (n_t >= n ){  n_t = n-1;
    i_t = iam - (n-1)*nt_dir;
    nt_dir = nt -(n-1)*nt_dir;
  }
  int w_t = wire[n_t];
  ipoints = (non_local_chi[w_t]/2)/nt_dir;
  offset = ipoints*i_t;
  if (i_t == (nt_dir-1)) ipoints = (non_local_chi[w_t]/2)-offset;
    if ( if_print )
      printf("thread %d of %d nt_dir n_t i_t ipoints offset= %d %d %d %d %d\n",iam,nt,nt_dir,n_t,i_t,ipoints,offset);
  //Non-local computation
  if (ipoints>0)
  partrans_cmm_agg((uc_nl[w_t]+offset*2),(matrix *)rcv_buf[w_t],mout[n_t],ipoints);
    if ( if_print )
      printf("thread %d of %d done\n",iam,nt);
}
#else
{
#pragma omp parallel for
  for(i=0;i<n;i++) 
  if (!local[wire[i]/2]) {
#ifdef USE_OMP
    if (call_num%10000==1 && !QMP_get_node_number() ) 
      printf("thread %d of %d i=%d\n",omp_get_thread_num(),omp_get_num_threads(),i);
#endif
    partrans_cmm_agg(uc_nl[wire[i]],(matrix *)rcv_buf[wire[i]],mout[i],non_local_chi[wire[i]]/2);
  }

}//#pragma omp parallel
#endif

  dtime += dclock();
  nonlocal +=dtime;

  if (call_num%100==0){
    static char *cname="mat()";
    if (!QMP_get_node_number() ) {
    print_flops("mat():local*100",0,localt);
    print_flops("mat():nonlocal*100",0,nonlocal);
    print_flops("mat():qmp*100",0,qmp);
    print_flops("mat():setup*100",0,setup);
    }
    localt=nonlocal=qmp=setup=0.;
  }


#ifdef PROFILE
  dtime2 +=dclock();
  print_flops("",fname,198*vol*n,dtime2);
#endif
//  ParTrans::PTflops +=198*n*vol;
}
コード例 #13
0
//------------------------------------------------------------------
// "Odd" fermion force evolution routine written by Chris Dawson, taken 
// verbatim, so performance will suck on qcdoc.
//------------------------------------------------------------------
ForceArg FdwfBase::EvolveMomFforce( Matrix* mom, // momenta
                               Vector* phi, // odd pseudo-fermion field
                               Vector* eta, // very odd pseudo-fermion field
                               Float  mass, 
                               Float dt )
{
  char *fname = "EvolveMomFforce(M*,V*,V*,F,F)";
  VRB.Func(cname,fname);
  
  Matrix *gauge = GaugeField() ;

  if (Colors() != 3)       { ERR.General(cname,fname,"Wrong nbr of colors.") ; }
  if (SpinComponents()!=4) { ERR.General(cname,fname,"Wrong nbr of spin comp.") ;}
  if (mom == 0)            { ERR.Pointer(cname,fname,"mom") ; }
  if (phi == 0)            { ERR.Pointer(cname,fname,"phi") ; }
   
  // allocate space for two CANONICAL fermion fields

  // these are all full fermion vector sizes ( i.e. *not* preconditioned )

  const int f_size        ( FsiteSize() * GJP.VolNodeSites() );
  const int f_size_cb     ( f_size/2 ) ; // f_size must be multiple of 2
  const int f_site_size_4d( 2 * Colors() * SpinComponents() );
  const int f_size_4d     ( f_site_size_4d * GJP.VolNodeSites()) ;
  
  char *str_v1 = "v1" ;
  Vector *v1 = (Vector *)smalloc(f_size*sizeof(Float)) ;
  if (v1 == 0) ERR.Pointer(cname, fname, str_v1) ;
  VRB.Smalloc(cname, fname, str_v1, v1, f_size*sizeof(Float)) ;

  char *str_v2 = "v2" ;
  Vector *v2 = (Vector *)smalloc(f_size*sizeof(Float)) ;
  if (v2 == 0) ERR.Pointer(cname, fname, str_v2) ;
  VRB.Smalloc(cname, fname, str_v2, v2, f_size*sizeof(Float)) ;

  Float L1 = 0.0;
  Float L2 = 0.0;
  Float Linf = 0.0;

#ifdef PROFILE
  Float time = -dclock();
  ForceFlops=0;
#endif

  //Calculate v1, v2. Both must be in CANONICAL order afterwards
  {
    CgArg cg_arg ;
    cg_arg.mass = mass ;
    
    DiracOpDwf dwf(*this, v1, v2, &cg_arg, CNV_FRM_YES) ;
    Float kappa( 1.0 / ( 2 * (4 + GJP.DwfA5Inv() - GJP.DwfHeight())));

    v2->CopyVec(phi,f_size_cb);

    // rescale the input field. As the second half of the this field
    // will be constructed by acting with the PC dslash on v1, this
    // rescales *one* of the full vectors - giving rise to an overall
    // rescaling of the final answer by exactly -\kappa^2
    
    v2->VecTimesEquFloat(-kappa*kappa,f_size_cb);

    // only need one factor of -\kappa^2, so don't rescale the second
    // full vector (v2)
    v1->CopyVec(eta,f_size_cb);
        
    dwf.Dslash(v2+(f_size_cb/6), v2 , CHKB_ODD, DAG_YES);
    dwf.Dslash(v1+(f_size_cb/6), v1 , CHKB_ODD, DAG_NO);
    
    // v1 and v2 are now the vectors needed to contruct the force term
    // written in ( ODD, EVEN ) ordering. They will be converted back
    // into canonical ordering when the destructor is called.
    
  }  

  // two fermion vectors at a single position
  //    - these will be used to store off-node
  //      field components

 
  char *str_site_v1 = "site_v1" ;
  Float *site_v1 = (Float *)smalloc(FsiteSize()*sizeof(Float)) ;
  if (site_v1 == 0) ERR.Pointer(cname, fname, str_site_v1) ;
  VRB.Smalloc(cname, fname, str_site_v1, site_v1, FsiteSize()*sizeof(Float)) ;

  char *str_site_v2 = "site_v2" ;
  Float *site_v2 = (Float *)smalloc(FsiteSize()*sizeof(Float)) ;
  if (site_v2 == 0) ERR.Pointer(cname, fname, str_site_v2) ;
  VRB.Smalloc(cname, fname, str_site_v2, site_v2, FsiteSize()*sizeof(Float)) ;

  // evolve the momenta by the fermion force
  int mu, x, y, z, t, s;
 
  const int lx(GJP.XnodeSites());
  const int ly(GJP.YnodeSites());
  const int lz(GJP.ZnodeSites());
  const int lt(GJP.TnodeSites());
  const int ls(GJP.SnodeSites());
  
  // start by summing first over direction (mu) and then over site to
  // allow SCU transfers to happen face-by-face in the outermost loop.

  VRB.Clock(cname, fname, "Before loop over links.\n") ;

  for (mu=0; mu<4; mu++) {
    for (t=0; t<lt; t++){
      for (z=0; z<lz; z++){
        for (y=0; y<ly; y++){
          for (x=0; x<lx; x++) {
            // position offset
            int gauge_offset = x+lx*(y+ly*(z+lz*t));
            
            // offset for vector field at this point
            // (4d only, no fifth dimension)
            int vec_offset = f_site_size_4d*gauge_offset ;
            
            // offset for link in mu direction from this point
            gauge_offset = mu+4*gauge_offset ; 
            
            Float *v1_plus_mu=NULL ;
            Float *v2_plus_mu=NULL ;
            int vec_plus_mu_stride=0 ;
            int vec_plus_mu_offset = f_site_size_4d ;
            
            // sign of coeff (look at momenta update)
            Float coeff = -2.0 * dt ;
            
            switch (mu) 
              {
              case 0 :
                // next position in mu direction
                vec_plus_mu_offset *= (x+1)%lx+lx*(y+ly*(z+lz*t)) ;
                // vec_plus_mu_offset now the correct
                // offset for a fermion field at this point
                // in the lattice 
                if ((x+1) == lx) 
                  {
                    // off-node
                    for (s=0; s<ls; s++) 
                      {
                        // fill site_v1 and site_v2 with v1 and v2 data
                        // from x=0 on next node, need loop because
                        // data is not contiguous in memory 
                        getPlusData( (Float *)site_v1+s*f_site_size_4d,
                                     (Float *)v1+vec_plus_mu_offset+s*f_size_4d,
                                     f_site_size_4d, mu) ;
                        getPlusData( (Float*)site_v2+s*f_site_size_4d,
                                     (Float*)v2+vec_plus_mu_offset+s*f_size_4d,
                                     f_site_size_4d, mu) ;
                      } // end for s
                    
                    v1_plus_mu = site_v1   ;  
                    v2_plus_mu = site_v2   ;  
                    vec_plus_mu_stride = 0 ;  // field now contiguous
                    
                    // GJP.XnodeBc() gives the forward boundary
                    // condition only (so this should work).
                    if (GJP.XnodeBc()==BND_CND_APRD) coeff = -coeff ;
                  } 
                else 
                  {
                    // on - node
                    //
                    // just add offset to v1 and v2
                    // (they are now 1 forward in the mu direction )
                    //
                    v1_plus_mu = (Float*)v1+vec_plus_mu_offset ;
                    v2_plus_mu = (Float*)v2+vec_plus_mu_offset ;
                    vec_plus_mu_stride = f_size_4d - f_site_size_4d ; // explained below
                  }
                break ;
                // Repeat for the other directions
              case 1 :
                vec_plus_mu_offset *= x+lx*((y+1)%ly+ly*(z+lz*t)) ;
                if ((y+1) == ly) 
                  {
                    for (s=0; s<ls; s++) 
                      {
                        getPlusData( (Float*)site_v1+s*f_site_size_4d,
                                     (Float*)v1+vec_plus_mu_offset+s*f_size_4d,
                                     f_site_size_4d, mu) ;
                        getPlusData( (Float*)site_v2+s*f_site_size_4d,
                                     (Float*)v2+vec_plus_mu_offset+s*f_size_4d,
                                     f_site_size_4d, mu) ;
                      }
                    v1_plus_mu = site_v1 ;
                    v2_plus_mu = site_v2 ;
                    vec_plus_mu_stride = 0 ;
                    if (GJP.YnodeBc()==BND_CND_APRD) coeff = -coeff ;
                  } 
                else 
                  {
                    v1_plus_mu = (Float *)v1+vec_plus_mu_offset ;
                    v2_plus_mu = (Float *)v2+vec_plus_mu_offset ;
                    vec_plus_mu_stride = f_size_4d - f_site_size_4d ;
                  }
                break ;
              case 2 :
                vec_plus_mu_offset *= x+lx*(y+ly*((z+1)%lz+lz*t)) ;
                if ((z+1) == lz) 
                  {
                    for (s=0; s<ls; s++) 
                      {
                        getPlusData( (Float*)site_v1+s*f_site_size_4d,
                                     (Float*)v1+vec_plus_mu_offset+s*f_size_4d,
                                     f_site_size_4d, mu) ;
                        getPlusData( (Float*)site_v2+s*f_site_size_4d,
                                     (Float*)v2+vec_plus_mu_offset+s*f_size_4d,
                                     f_site_size_4d, mu) ;
                      }
                    v1_plus_mu = site_v1 ;
                    v2_plus_mu = site_v2 ;
                    vec_plus_mu_stride = 0 ;
                    if (GJP.ZnodeBc()==BND_CND_APRD) coeff = -coeff ;
                  } 
                else 
                  {
                    v1_plus_mu = (Float *)v1+vec_plus_mu_offset ;
                    v2_plus_mu = (Float *)v2+vec_plus_mu_offset ;
                    vec_plus_mu_stride = f_size_4d - f_site_size_4d ;
                  }
                break ;
              case 3 :
                vec_plus_mu_offset *= x+lx*(y+ly*(z+lz*((t+1)%lt))) ;
                if ((t+1) == lt) 
                  {
                    for (s=0; s<ls; s++) 
                      {
                        getPlusData( (Float*)site_v1+s*f_site_size_4d,
                                     (Float*)v1+vec_plus_mu_offset+s*f_size_4d,
                                     f_site_size_4d, mu) ;
                        getPlusData( (Float*)site_v2+s*f_site_size_4d,
                                     (Float*)v2+vec_plus_mu_offset+s*f_size_4d,
                                     f_site_size_4d, mu) ;
                      } 
                    v1_plus_mu = site_v1 ;
                    v2_plus_mu = site_v2 ;
                    vec_plus_mu_stride = 0 ;
                    if (GJP.TnodeBc()==BND_CND_APRD) coeff = -coeff ;
                  } 
                else 
                  {
                    v1_plus_mu = (Float *)v1+vec_plus_mu_offset ;
                    v2_plus_mu = (Float *)v2+vec_plus_mu_offset ;
                    vec_plus_mu_stride = f_size_4d - f_site_size_4d ;
                  }
              } // end (the evil) mu switch 


            Matrix tmp_mat1, tmp_mat2;  

            // the non-zero stride pattern is due to domain wall
            // fermions ( summing up *ls* different sproj's )
            //
            // f_size_4d-f_site_size_4d is the number of floats
            // between the end of one spinor at s and the start of the 
            // spinor at s+1 
            // 
            // vec_plus_mu_stride is the same, except when
            // this is off boundary, in that case the info
            // is copied into a contiguous block in the above code
            // and vec_plus_mu_stride set to zero
            
            // ( 1 - gamma_\mu ) Tr_s [ v1(x+\mu) v2^{\dagger}(x) ]
            
            sproj_tr[mu]( (Float *)&tmp_mat1,
                          (Float *)v1_plus_mu,
                          (Float *)v2+vec_offset,
                          ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ;
            
            // (1 + gamma_\mu)  Tr_s [ v2(x+\mu) v1^{\dagger}(x) ]
            sproj_tr[mu+4]( (Float *)&tmp_mat2,
                            (Float *)v2_plus_mu,
                            (Float *)v1+vec_offset,
                            ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ;
            
            
            // exactly what is sounds like
            tmp_mat1 += tmp_mat2 ;
            
            if(GJP.Snodes() != 1) {
              for (s=0; s<(sizeof(Matrix)/sizeof(Float)); ++s) {
                glb_sum_dir((Float *)&tmp_mat1 + s, 4) ;
              }
            }
            
            // multiply sum by the link in the \mu direction
            tmp_mat2.DotMEqual(*(gauge+gauge_offset), tmp_mat1) ;
            
            // take tracless antihermitian piece
            // TrLessAntiHermMatrix need to be passed
            // the dagger of the matrix in question
            tmp_mat1.Dagger(tmp_mat2) ;
            tmp_mat2.TrLessAntiHermMatrix(tmp_mat1) ;

            tmp_mat2 *= coeff ;
            
            // note the minus sign.
            *(mom+gauge_offset) -= tmp_mat2 ;
	    Float norm = tmp_mat2.norm();
	    Float tmp = sqrt(norm);
	    L1 += tmp;
	    L2 += norm;
	    Linf = (tmp>Linf ? tmp : Linf);
	    
          } // end for x
        } // end for y
      } // end for z
    } // end for t
  } // end for mu
  ForceFlops += (2*9*16*ls + 18+ 198+36+24)*lx*ly*lz*lt*4;
#ifdef PROFILE
  time += dclock();
  print_flops(cname,fname,ForceFlops,time);
#endif
  
  // deallocate smalloc'd space

  VRB.Sfree(cname, fname, str_site_v2, site_v2) ;
  sfree(site_v2) ;
 
  VRB.Sfree(cname, fname, str_site_v1, site_v1) ;
  sfree(site_v1) ;
 
  VRB.Sfree(cname, fname, str_v2, v2) ;
  sfree(v2) ;
  
  VRB.Sfree(cname, fname, str_v1, v1) ;
  sfree(v1) ;

  glb_sum(&L1);
  glb_sum(&L2);
  glb_max(&Linf);

  L1 /= 4.0*GJP.VolSites();
  L2 /= 4.0*GJP.VolSites();

  VRB.FuncEnd(cname,fname);
  return ForceArg(L1, sqrt(L2), Linf);

}
コード例 #14
0
CPS_START_NAMESPACE
/*!\file
  \brief  Implementation of FdwfBase class.

  $Id: f_dwf_base_force.C,v 1.14 2012-08-31 04:55:08 chulwoo Exp $
*/
//--------------------------------------------------------------------
//  CVS keywords
//
//  $Source: /home/chulwoo/CPS/repo/CVS/cps_only/cps_pp/src/util/lattice/f_dwf_base/noarch/f_dwf_base_force.C,v $
//  $State: Exp $
//
//--------------------------------------------------------------------
//------------------------------------------------------------------
//
// f_dwf_base_force.C
//
// (R)HMC force term for FdwfBase
//
//------------------------------------------------------------------

CPS_END_NAMESPACE
#include <util/qcdio.h>
#include <math.h>
#include <util/lattice.h>
#include <util/dirac_op.h>
#include <util/dwf.h>
#include <util/gjp.h>
#include <util/verbose.h>
#include <util/vector.h>
#include <util/random.h>
#include <util/error.h>
#include <util/time_cps.h>
#include <comms/scu.h> // GRF
#include <comms/glb.h>
CPS_START_NAMESPACE
#undef PROFILE

// CJ: change start
//------------------------------------------------------------------
// EvolveMomFforce(Matrix *mom, Vector *chi, Float mass, 
//                 Float dt):
// It evolves the canonical momentum mom by dt
// using the fermion force.
//------------------------------------------------------------------
ForceArg FdwfBase::EvolveMomFforce(Matrix *mom, Vector *chi, 
			   Float mass, Float dt){
  char *fname = "EvolveMomFforce(M*,V*,F,F,F)";
  VRB.Func(cname,fname);

  Matrix *gauge = GaugeField() ;

  if (Colors() != 3)
    ERR.General(cname,fname,"Wrong nbr of colors.") ;
 
  if (SpinComponents() != 4)
    ERR.General(cname,fname,"Wrong nbr of spin comp.") ;
 
  if (mom == 0)
    ERR.Pointer(cname,fname,"mom") ;
 
  if (chi == 0)
    ERR.Pointer(cname,fname,"chi") ;
 
  //----------------------------------------------------------------
  // allocate space for two CANONICAL fermion fields
  //----------------------------------------------------------------

  int f_size = FsiteSize() * GJP.VolNodeSites() ;
  int f_site_size_4d = 2 * Colors() * SpinComponents();
  int f_size_4d = f_site_size_4d * GJP.VolNodeSites() ;
 
  char *str_v1 = "v1" ;
  Vector *v1 = (Vector *)smalloc(f_size*sizeof(Float)) ;
  if (v1 == 0) ERR.Pointer(cname, fname, str_v1) ;
  VRB.Smalloc(cname, fname, str_v1, v1, f_size*sizeof(Float)) ;

  char *str_v2 = "v2" ;
  Vector *v2 = (Vector *)smalloc(f_size*sizeof(Float)) ;
  if (v2 == 0) ERR.Pointer(cname, fname, str_v2) ;
  VRB.Smalloc(cname, fname, str_v2, v2, f_size*sizeof(Float)) ;

  //----------------------------------------------------------------
  // allocate buffer space for two fermion fields that are assoc
  // with only one 4-D site.
  //----------------------------------------------------------------

  char *str_site_v1 = "site_v1" ;
  Float *site_v1 = (Float *)smalloc(FsiteSize()*sizeof(Float)) ;
  if (site_v1 == 0) ERR.Pointer(cname, fname, str_site_v1) ;
  VRB.Smalloc(cname, fname, str_site_v1, site_v1, FsiteSize()*sizeof(Float)) ;

  char *str_site_v2 = "site_v2" ;
  Float *site_v2 = (Float *)smalloc(FsiteSize()*sizeof(Float)) ;
  if (site_v2 == 0) ERR.Pointer(cname, fname, str_site_v2) ;
  VRB.Smalloc(cname, fname, str_site_v2, site_v2, FsiteSize()*sizeof(Float)) ;

  Float L1 = 0.0;
  Float L2 = 0.0;
  Float Linf = 0.0;

  //----------------------------------------------------------------
  // Calculate v1, v2. Both v1, v2 must be in CANONICAL order after
  // the calculation.
  //----------------------------------------------------------------  

  VRB.Clock(cname, fname, "Before calc force vecs.\n") ;
  VRB.Flow(cname, fname, "Before calc force vecs.\n") ;

  {
    CgArg cg_arg ;
    cg_arg.mass = mass ;

    DiracOpDwf dwf(*this, v1, v2, &cg_arg, CNV_FRM_YES) ;
    dwf.CalcHmdForceVecs(chi) ;
  }
  VRB.Flow(cname, fname, "After calc force vecs.\n") ;
#ifdef PROFILE
  Float time = -dclock();
  ForceFlops=0;
#endif

  int mu, x, y, z, t, s, lx, ly, lz, lt, ls ;
 
  lx = GJP.XnodeSites() ;
  ly = GJP.YnodeSites() ;
  lz = GJP.ZnodeSites() ;
  lt = GJP.TnodeSites() ;
  ls = GJP.SnodeSites() ;

  Matrix tmp_mat1, tmp_mat2 ;
 
//------------------------------------------------------------------
// start by summing first over direction (mu) and then over site
// to allow SCU transfers to happen face-by-face in the outermost
// loop.
//------------------------------------------------------------------

  VRB.Clock(cname, fname, "Before loop over links.\n") ;

  for (mu=0; mu<4; mu++){
    for (t=0; t<lt; t++){
    for (z=0; z<lz; z++){
    for (y=0; y<ly; y++){
    for (x=0; x<lx; x++){
      int gauge_offset = x+lx*(y+ly*(z+lz*t)) ;
      int vec_offset = f_site_size_4d*gauge_offset ;
      gauge_offset = mu+4*gauge_offset ;

      Float *v1_plus_mu ;
      Float *v2_plus_mu ;
      int vec_plus_mu_stride ;
      int vec_plus_mu_offset = f_site_size_4d ;

      Float coeff = -2.0 * dt ;

      switch (mu) {
        case 0 :
          vec_plus_mu_offset *= (x+1)%lx+lx*(y+ly*(z+lz*t)) ;
          if ((x+1) == lx) {
            for (s=0; s<ls; s++) {
              getPlusData( (IFloat *)site_v1+s*f_site_size_4d,
                (IFloat *)v1+vec_plus_mu_offset+s*f_size_4d,
                f_site_size_4d, mu) ;
              getPlusData( (IFloat *)site_v2+s*f_site_size_4d,
                (IFloat *)v2+vec_plus_mu_offset+s*f_size_4d,
                f_site_size_4d, mu) ;
            } // end for s
            v1_plus_mu = site_v1 ;
            v2_plus_mu = site_v2 ;
            vec_plus_mu_stride = 0 ;
            if (GJP.XnodeBc()==BND_CND_APRD) coeff = -coeff ;
          } else {
            v1_plus_mu = (Float *)v1+vec_plus_mu_offset ;
            v2_plus_mu = (Float *)v2+vec_plus_mu_offset ;
            vec_plus_mu_stride = f_size_4d - f_site_size_4d ;
          }
          break ;
        case 1 :
          vec_plus_mu_offset *= x+lx*((y+1)%ly+ly*(z+lz*t)) ;
          if ((y+1) == ly) {
            for (s=0; s<ls; s++) {
              getPlusData( (IFloat *)site_v1+s*f_site_size_4d,
                (IFloat *)v1+vec_plus_mu_offset+s*f_size_4d,
                f_site_size_4d, mu) ;
              getPlusData( (IFloat *)site_v2+s*f_site_size_4d,
                (IFloat *)v2+vec_plus_mu_offset+s*f_size_4d,
                f_site_size_4d, mu) ;
            } // end for s
            v1_plus_mu = site_v1 ;
            v2_plus_mu = site_v2 ;
            vec_plus_mu_stride = 0 ;
            if (GJP.YnodeBc()==BND_CND_APRD) coeff = -coeff ;
          } else {
            v1_plus_mu = (Float *)v1+vec_plus_mu_offset ;
            v2_plus_mu = (Float *)v2+vec_plus_mu_offset ;
            vec_plus_mu_stride = f_size_4d - f_site_size_4d ;
          }
          break ;
        case 2 :
          vec_plus_mu_offset *= x+lx*(y+ly*((z+1)%lz+lz*t)) ;
          if ((z+1) == lz) {
            for (s=0; s<ls; s++) {
              getPlusData( (IFloat *)site_v1+s*f_site_size_4d,
                (IFloat *)v1+vec_plus_mu_offset+s*f_size_4d,
                f_site_size_4d, mu) ;
              getPlusData( (IFloat *)site_v2+s*f_site_size_4d,
                (IFloat *)v2+vec_plus_mu_offset+s*f_size_4d,
                f_site_size_4d, mu) ;
            } // end for s
            v1_plus_mu = site_v1 ;
            v2_plus_mu = site_v2 ;
            vec_plus_mu_stride = 0 ;
            if (GJP.ZnodeBc()==BND_CND_APRD) coeff = -coeff ;
          } else {
            v1_plus_mu = (Float *)v1+vec_plus_mu_offset ;
            v2_plus_mu = (Float *)v2+vec_plus_mu_offset ;
            vec_plus_mu_stride = f_size_4d - f_site_size_4d ;
          }
          break ;
        case 3 :
          vec_plus_mu_offset *= x+lx*(y+ly*(z+lz*((t+1)%lt))) ;
          if ((t+1) == lt) {
            for (s=0; s<ls; s++) {
              getPlusData( (IFloat *)site_v1+s*f_site_size_4d,
                (IFloat *)v1+vec_plus_mu_offset+s*f_size_4d,
                f_site_size_4d, mu) ;
              getPlusData( (IFloat *)site_v2+s*f_site_size_4d,
                (IFloat *)v2+vec_plus_mu_offset+s*f_size_4d,
                f_site_size_4d, mu) ;
            } // end for s
            v1_plus_mu = site_v1 ;
            v2_plus_mu = site_v2 ;
            vec_plus_mu_stride = 0 ;
            if (GJP.TnodeBc()==BND_CND_APRD) coeff = -coeff ;
          } else {
            v1_plus_mu = (Float *)v1+vec_plus_mu_offset ;
            v2_plus_mu = (Float *)v2+vec_plus_mu_offset ;
            vec_plus_mu_stride = f_size_4d - f_site_size_4d ;
          }
      } // end switch mu 

      sproj_tr[mu]( (IFloat *)&tmp_mat1,
                    (IFloat *)v1_plus_mu,
                    (IFloat *)v2+vec_offset,
                    ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ;

      sproj_tr[mu+4]( (IFloat *)&tmp_mat2,
                      (IFloat *)v2_plus_mu,
                      (IFloat *)v1+vec_offset,
                      ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ;

      tmp_mat1 += tmp_mat2 ;

      // If GJP.Snodes > 1 sum up contributions from all s nodes
      if(GJP.Snodes() > 1) {
//        if(!UniqueID())printf("%s::%s:GJP.Snodes()=%d\n",cname,fname,GJP.Snodes()); 
	glb_sum_multi_dir((Float *)&tmp_mat1,4,sizeof(Matrix)/sizeof(IFloat));
      }

      tmp_mat2.DotMEqual(*(gauge+gauge_offset), tmp_mat1) ;

      tmp_mat1.Dagger(tmp_mat2) ;

      tmp_mat2.TrLessAntiHermMatrix(tmp_mat1) ;

      tmp_mat2 *= coeff ;

      *(mom+gauge_offset) += tmp_mat2 ;
      Float norm = tmp_mat2.norm();
      Float tmp = sqrt(norm);
      L1 += tmp;
      L2 += norm;
      Linf = (tmp>Linf ? tmp : Linf);

    } } } } // end for x,y,z,t
  } // end for mu
  ForceFlops += (2*9*16*ls + 18+ 198+36+24)*lx*ly*lz*lt*4;
#ifdef PROFILE
  time += dclock();
  print_flops(cname,fname,ForceFlops,time);
#endif
 
//------------------------------------------------------------------
// deallocate smalloc'd space
//------------------------------------------------------------------
  VRB.Sfree(cname, fname, str_site_v2, site_v2) ;
  sfree(site_v2) ;
 
  VRB.Sfree(cname, fname, str_site_v1, site_v1) ;
  sfree(site_v1) ;
 
  VRB.Sfree(cname, fname, str_v2, v2) ;
  sfree(v2) ;
 
  VRB.Sfree(cname, fname, str_v1, v1) ;
  sfree(v1) ;
 
  glb_sum(&L1);
  glb_sum(&L2);
  glb_max(&Linf);

  L1 /= 4.0*GJP.VolSites();
  L2 /= 4.0*GJP.VolSites();

  VRB.FuncEnd(cname,fname);
  return ForceArg(L1, sqrt(L2), Linf);

}
コード例 #15
0
ファイル: main.C プロジェクト: DeanHowarth/QUDA-CPS
int main(int argc,char *argv[]){


#if TARGET == QCDOC
    DefaultSetup();
    printf("Sizes = %d %d %d %d %d %d\n",SizeX(),SizeY(),SizeZ(),SizeT(),SizeS(),SizeW());
    printf("Coors = %d %d %d %d %d %d\n",CoorX(),CoorY(),CoorZ(),CoorT(),CoorS(),CoorW());
#endif
    FILE *fp;
    double dtime;

    //----------------------------------------------------------------
    // Initializes all Global Job Parameters
    //----------------------------------------------------------------
    DoArg do_arg;
    int nx,ny,nz,nt;

    if (argc < 5){
        ERR.General("f_stag_test","main()","usage: %s nx ny nz nt\n",argv[0]);
    }
    sscanf(argv[1],"%d",&nx);
    sscanf(argv[2],"%d",&ny);
    sscanf(argv[3],"%d",&nz);
    sscanf(argv[4],"%d",&nt);
    printf("total sites = %d %d %d %d\n",nx,ny,nz,nt);
#if TARGET == QCDOC
    do_arg.x_node_sites = nx/SizeX();
    do_arg.y_node_sites = ny/SizeY();
    do_arg.z_node_sites = nz/SizeZ();
    do_arg.t_node_sites = nt/SizeT();
    do_arg.s_node_sites = 1;
    do_arg.x_nodes = SizeX();
    do_arg.y_nodes = SizeY();
    do_arg.z_nodes = SizeZ();
    do_arg.t_nodes = SizeT();
    do_arg.s_nodes = 1;
#else
    do_arg.x_node_sites = nx;
    do_arg.y_node_sites = ny;
    do_arg.z_node_sites = nz;
    do_arg.t_node_sites = nt;
    do_arg.s_node_sites = 0;
    do_arg.x_nodes = 1;
    do_arg.y_nodes = 1;
    do_arg.z_nodes = 1;
    do_arg.t_nodes = 1;
    do_arg.s_nodes = 1;
#endif 
    do_arg.x_bc = BND_CND_PRD;
    do_arg.y_bc = BND_CND_PRD;
    do_arg.z_bc = BND_CND_PRD;
    do_arg.t_bc = BND_CND_APRD;

    do_arg.start_conf_kind = START_CONF_DISORD;

    do_arg.start_seed_kind = START_SEED_FIXED;
//    do_arg.colors = 3;
    do_arg.beta = 5.5;
    do_arg.dwf_height = 0.9;
    do_arg.clover_coeff = 2.0171;
//    do_arg.verbose_level = -1205;

    CgArg cg_arg;

    cg_arg.mass = 0.1;
    cg_arg.stop_rsd = 1e-12;
    cg_arg.max_num_iter = 500;
    GJP.Initialize(do_arg);

//   VRB.Level(GJP.VerboseLevel());
	VRB.Level(0);
	VRB.ActivateLevel(VERBOSE_FUNC_LEVEL);
	VRB.ActivateLevel(VERBOSE_FLOW_LEVEL);
	VRB.ActivateLevel(VERBOSE_RNGSEED_LEVEL);

#if TARGET == QCDOC
    char filename [200];
    sprintf(filename,"%s%d%d%d%d%d%d_%d%d%d%d%d%d.out",f_stag_test_filename,SizeX(),SizeY(),SizeZ(),SizeT(),SizeS(),SizeW(),CoorX(),CoorY(),CoorZ(),CoorT(),CoorS(),CoorW());
   fp = Fopen(filename,"w");
#else
    fp = Fopen("f_stag_test.out","w");
#endif

    GwilsonFstag lat;

    Vector *result = 
	(Vector*)smalloc(GJP.VolNodeSites()*lat.FsiteSize()*sizeof(IFloat));
    Vector *X_out =
	(Vector*)smalloc(GJP.VolNodeSites()*lat.FsiteSize()*sizeof(IFloat));
    Vector *X_out2 =
	(Vector*)smalloc(GJP.VolNodeSites()*lat.FsiteSize()*sizeof(IFloat));

    if(!result) ERR.Pointer("","","result");
    if(!X_out) ERR.Pointer("","","X_out");
    if(!X_out2) ERR.Pointer("","","X_out2");
    Vector *X_out_odd = &(X_out[GJP.VolNodeSites()/2]);

    int s[4];
    Vector *X_in =
	(Vector*)smalloc(GJP.VolNodeSites()*lat.FsiteSize()*sizeof(IFloat));
    if(!X_in) ERR.Pointer("","","X_in");
#if 1
	lat.RandGaussVector(X_in,1.0);
#else
    Vector *X_in_odd = &(X_in[GJP.VolNodeSites()/2]);

    Matrix *gf = lat.GaugeField();
    IFloat *gf_p = (IFloat *)lat.GaugeField();

    for(s[3]=0; s[3]<GJP.NodeSites(3); s[3]++)
	for(s[2]=0; s[2]<GJP.NodeSites(2); s[2]++)
	    for(s[1]=0; s[1]<GJP.NodeSites(1); s[1]++)
		for(s[0]=0; s[0]<GJP.NodeSites(0); s[0]++) {

		    int n = lat.FsiteOffset(s);
			IFloat *temp_p = (IFloat *)(gf+4*n+3);

		    IFloat crd = 1.0*s[0]+0.1*s[1]+0.01*s[2]+0.001*s[3];
#if TARGET==QCDOC
		  if(CoorX()==0 && CoorY()==0 && CoorZ()==0 && CoorT()==0 &&n==0) crd=1.0; else crd = 0.0;
#else
	if(n==0) crd = 1.0; else crd = 0.0;
#endif
					
		    for(int v=0; v<6; v+=2){ 
			if (v==0)
			*((IFloat*)&X_in[n]+v) = crd;
			else
			*((IFloat*)&X_in[n]+v) = 0;
			*((IFloat*)&X_in[n]+v+1) = 0.0;
		    }
		}
#endif

    Vector *out;
    DiracOpStag dirac(lat,X_out,X_in,&cg_arg,CNV_FRM_NO);

	for(int k = 0; k< 1; k++){
		printf("k=%d ",k);
		if (k ==0)
			out = result;
		else
			out = X_out;
		bzero((char *)out, GJP.VolNodeSites()*lat.FsiteSize()*sizeof(IFloat));
		lat.Fconvert(out,STAG,CANONICAL);
		lat.Fconvert(X_in,STAG,CANONICAL);
		int offset = GJP.VolNodeSites()*lat.FsiteSize()/ (2*6);
#if 1
#if TARGET==QCDOC
		int vol = nx*ny*nz*nt/(SizeX()*SizeY()*SizeZ()*SizeT());
#else
		int vol = nx*ny*nz*nt;
#endif
		dtime = -dclock();
   		int iter = dirac.MatInv(out,X_in);
		dtime +=dclock();
		print_flops(606*iter*vol,dtime);
		printf("iter=%d\n",iter);
#else
		dirac.Dslash(out,X_in+offset,CHKB_ODD,DAG_NO);
		dirac.Dslash(out+offset,X_in,CHKB_EVEN,DAG_NO);
#endif

		if (k == 0){
			bzero((char *)X_out2, GJP.VolNodeSites()*lat.FsiteSize()*sizeof(IFloat));
			dirac.Dslash(X_out2,out+offset,CHKB_ODD,DAG_NO);
			dirac.Dslash(X_out2+offset,out,CHKB_EVEN,DAG_NO);
			lat.Fconvert(X_out2,CANONICAL,STAG);
		}
		lat.Fconvert(out,CANONICAL,STAG);
		lat.Fconvert(X_in,CANONICAL,STAG);
		X_out2->FTimesV1PlusV2(2*cg_arg.mass,out,X_out2,GJP.VolNodeSites
()*lat.FsiteSize());
    
    Float dummy;
    Float dt = 2;

    
    for(s[3]=0; s[3]<GJP.NodeSites(3); s[3]++) 
	for(s[2]=0; s[2]<GJP.NodeSites(2); s[2]++)
	    for(s[1]=0; s[1]<GJP.NodeSites(1); s[1]++)
		for(s[0]=0; s[0]<GJP.NodeSites(0); s[0]++) {

		    int n = lat.FsiteOffset(s);
			for(int i=0; i<3; i++){
#if TARGET == QCDOC
		    if ( k==0 )
				Fprintf(fp," %d %d %d %d %d ", CoorX()*GJP.NodeSites(0)+s[0], CoorY()*GJP.NodeSites(1)+s[1], CoorZ()*GJP.NodeSites(2)+s[2], CoorT()*GJP.NodeSites(3)+s[3], i);
#else
		    if ( k==0 )
				Fprintf(fp," %d %d %d %d %d ", s[0], s[1], s[2], s[3], i);
#endif
		    if ( k==0 )
				Fprintf(fp," (%0.7e %0.7e) (%0.7e %0.7e)",
				*((IFloat*)&result[n]+i*2), *((IFloat*)&result[n]+i*2+1),
				*((IFloat*)&X_in[n]+i*2), *((IFloat*)&X_in[n]+i*2+1));
#if 1
				Fprintf(fp,"\n");
#else
				Fprintf(fp," (%0.2e %0.2e)\n",
				*((IFloat*)&X_out2[n]+i*2)-*((IFloat*)&X_in[n]+i*2), *((IFloat*)&X_out2[n]+i*2+1)-*((IFloat*)&X_in[n]+i* 2+1));
#endif
			}
		}
}
    Fclose(fp);
    
    sfree(X_in);
    sfree(result);
    sfree(X_out);
    sfree(X_out2);
    return 0; 
}
コード例 #16
0
ファイル: p4_Fforce.C プロジェクト: DeanHowarth/QUDA-CPS
ForceArg Fp4::EvolveMomFforce(Matrix *mom, Vector *frm, Float mass, Float dt){
  char *fname = "EvolveMomFforce(M*,V*,F,F,F)";
  ERR.NotImplemented(cname,fname);
  ForceArg  Fdt;

#if 0
  VRB.Func(cname,fname);

#ifdef PROFILE
  Float dtime;
  ParTrans::PTflops=0;
  ForceFlops=0;
#endif
  size_t size;
  //	int nflops=0;
  static int vax_len = 0;
  if (vax_len == 0)
    vax_len = GJP.VolNodeSites()*VECT_LEN/VAXPY_UNROLL;

  size = GJP.VolNodeSites()/2*FsiteSize()*sizeof(Float);
  Vector *X = (Vector *)smalloc(2*size);
  //    printf("X=%p\n",X);
  Vector *X_e = X;                             // even sites
  Vector *X_o = X+GJP.VolNodeSites()/2;  // odd sites

  // The argument frm should have the CG solution.
  // The FstagTypes protected pointer f_tmp should contain Dslash frm

  moveMem(X_e, frm, size);
#ifdef DEBUGGING
  f_tmp = frm+GJP.VolNodeSites()/2; // debugging only
#endif
  moveMem(X_o, f_tmp, size);
  Fconvert(X, CANONICAL, STAG);

  Convert(STAG);  // Puts staggered phases into gauge field.
    
    
  int N; // N can be 1, 2 or 4.
  N = 4;
  if (GJP.VolNodeSites()>256)
    N = 2;  
  else if (GJP.VolNodeSites()>512)
    N = 1;
  VRB.Flow(cname,fname,"N=%d\n",N);

  enum{plus=0, minus=1, n_sign=2};

  // Array in which to accumulate the force term:
  // this must be initialised to zero 
#if 0
  Matrix **force = (Matrix**)amalloc(sizeof(Matrix), 2, 4, GJP.VolNodeSites());
  if(!force) ERR.Pointer(cname, fname, "force");
#else
  size = GJP.VolNodeSites()*sizeof(Matrix);
  Matrix *force[4];
  for(int i = 0;i<4;i++)
    force[i] = (Matrix *)v_alloc("force[i]",size);
#endif
  for(int i=0; i<4; i++)
    for(int s=0; s<GJP.VolNodeSites(); s++) force[i][s].ZeroMatrix();
  ParTransAsqtad parallel_transport(*this);


  // Vector arrays for which we must allocate memory

#if 0
  Vector ***Pnu = (Vector***)amalloc(sizeof(Vector), 3, n_sign, N, GJP.VolNodeSites());
  if(!Pnu) ERR.Pointer(cname, fname, "Pnu");
    
  Vector ****P3 = (Vector****)amalloc(sizeof(Vector), 4, n_sign, n_sign, N, GJP.VolNodeSites());
  if(!P3) ERR.Pointer(cname, fname, "P3");
  Vector ****Prhonu = (Vector****)amalloc(sizeof(Vector), 4, n_sign, n_sign, N, GJP.VolNodeSites());
  if(!Prhonu) ERR.Pointer(cname, fname, "Prhonu");
  Vector *****P5 = (Vector*****)amalloc(sizeof(Vector), 5, n_sign, n_sign, n_sign, N, GJP.VolNodeSites());
  if(!P5) ERR.Pointer(cname, fname, "P5");
  Vector ******P7 = (Vector******)amalloc(sizeof(Vector), 6, n_sign, n_sign, n_sign, n_sign, N, GJP.VolNodeSites());
  if(!P7) ERR.Pointer(cname, fname, "P7");
  Vector ******Psigma7 = (Vector******)amalloc(sizeof(Vector), 6, n_sign, n_sign, n_sign, n_sign, N, GJP.VolNodeSites());
  if(!Psigma7) ERR.Pointer(cname, fname, "Psigma7");

    
  // These vectors can be overlapped with previously allocated memory
    
  Vector **Pnununu = Prhonu[0][0];
  Vector ***Pnunu = Psigma7[0][0][0];;
  Vector ****Pnu5 = P7[0][0];
  Vector ****Pnu3 = P7[0][0];
  Vector *****Prho5 = Psigma7[0];
  Vector *****Psigmarhonu = Psigma7[0];
#else
  size = GJP.VolNodeSites()*sizeof(Vector);
  Vector *Pnu[n_sign][N];
  Vector *P3[n_sign][n_sign][N];
  Vector *Prhonu[n_sign][n_sign][N];
  Vector *P5[n_sign][n_sign][n_sign][N];
  Vector *P7[n_sign][n_sign][n_sign][n_sign][N];
  Vector *Psigma7[n_sign][n_sign][n_sign][n_sign][N];
  Vector *Pnununu[N];
  Vector *Pnunu[n_sign][N];
  Vector *Pnu5[n_sign][n_sign][N];
  Vector *Pnu3[n_sign][n_sign][N];
  Vector *Prho5[n_sign][n_sign][n_sign][N];
  Vector *Psigmarhonu[n_sign][n_sign][n_sign][N];
  //printf("Pnu=%p Psigmarhonu=%p\n",Pnu,Psigmarhonu);

  for(int w = 0;w<N;w++){
    for(int i = 0;i<n_sign;i++){
      Pnu[i][w]= (Vector *)v_alloc("Pnu",size);
      for(int j = 0;j<n_sign;j++){
	P3[i][j][w]= (Vector *)v_alloc("P3",size);
	Prhonu[i][j][w]= (Vector *)v_alloc("Prhonu",size);
	for(int k = 0;k<n_sign;k++){
	  P5[i][j][k][w]= (Vector *)v_alloc("P5",size);
	  for(int l = 0;l<n_sign;l++){
	    P7[i][j][k][l][w]= (Vector *)v_alloc("P7",size);
	    Psigma7[i][j][k][l][w]= (Vector *)v_alloc("Psigma7",size);
	  }
	  Prho5[i][j][k][w] = Psigma7[0][i][j][k][w];
	  Psigmarhonu[i][j][k][w] = Psigma7[0][i][j][k][w];
	}
	Pnu5[i][j][w]=P7[0][0][i][j][w];
	Pnu3[i][j][w]=P7[0][0][i][j][w];
      }
      Pnunu[i][w]=Psigma7[0][0][0][i][w];
    }
    Pnununu[w]=Prhonu[0][0][w];
  }

#endif
    


  // input/output arrays for the parallel transport routines
  Vector *vin[n_sign*N], *vout[n_sign*N];
  int dir[n_sign*N];
	
  int mu[N], nu[N], rho[N], sigma[N];   // Sets of directions
  int w;                                // The direction index 0...N-1
  int ms, ns, rs, ss;                   // Sign of direction
  bool done[4] = {false,false,false,false};  // Flags to tell us which 
  // nu directions we have done.
	    
#ifdef PROFILE
  dtime = -dclock();
#endif
  for (int m=0; m<4; m+=N){                     	    // Loop over mu
    for(w=0; w<N; w++) mu[w] = (m+w)%4; 

    for (int n=m+1; n<m+4; n++){                        // Loop over nu
      for(w=0; w<N; w++) nu[w] = (n+w)%4;

      // Pnu = U_nu X

      for(int i=0; i<N; i++){
	vin[i] = vin[i+N] = X;
	dir[n_sign*i] = n_sign*nu[i]+plus;        // nu_i
	dir[n_sign*i+1] = n_sign*nu[i]+minus;    // -nu_i
	vout[n_sign*i] = Pnu[minus][i];
	vout[n_sign*i+1] = Pnu[plus][i];
      }
      parallel_transport.run(n_sign*N, vout, vin, dir);

      // P3 = U_mu Pnu
      // ms is the nu sign index, ms is the mu sign index,
      // w is the direction index
      for(int i=0; i<N; i++){
	dir[n_sign*i] = n_sign*mu[i]+plus;        // mu_i
	dir[n_sign*i+1] = n_sign*mu[i]+minus;    // -mu_i
      }
      for(ns=0; ns<n_sign; ns++){               // ns is the sign of nu
	for(int i=0; i<N; i++){
	  vin[n_sign*i] = vin[n_sign*i+1] = Pnu[ns][i];
	  vout[n_sign*i] = P3[plus][ns][i];
	  vout[n_sign*i+1] = P3[minus][ns][i];
	}
	parallel_transport.run(n_sign*N, vout, vin, dir);
      }
	    
      for(w=0; w<N; w++)
	for(ns=0; ns<n_sign; ns++){
	  force_product_sum(P3[plus][ns][w], Pnu[ns][w],
			    GJP.staple3_coeff(),
			    force[mu[w]]);
	}

      for(int r=n+1; r<n+4; r++){                     // Loop over rho
	bool nextr = false;
	for(w=0; w<N; w++){
	  rho[w] = (r+w)%4;		
	  if(rho[w]==mu[w]){
	    nextr = true;
	    break;
	  }
	}
	if(nextr) continue;

	for(w=0; w<N; w++){                         // sigma
	  for(int s=rho[w]+1; s<rho[w]+4; s++){
	    sigma[w] = s%4;
	    if(sigma[w]!=mu[w] && sigma[w]!=nu[w]) break;
	  }
	}

	// Prhonu = U_rho Pnu 

	for(int i=0; i<N; i++){
	  dir[n_sign*i] = n_sign*rho[i]+plus;        
	  dir[n_sign*i+1] = n_sign*rho[i]+minus;    
	}
	for(ns=0; ns<n_sign; ns++){
	  for(int i=0; i<N; i++){
	    vin[n_sign*i] = vin[n_sign*i+1] = Pnu[ns][i];
	    vout[n_sign*i] = Prhonu[ns][minus][i];
	    vout[n_sign*i+1] = Prhonu[ns][plus][i];
	  }
	  parallel_transport.run(n_sign*N, vout, vin, dir);
	}

	// P5 = U_mu Prhonu

	for(int i=0; i<N; i++){
	  dir[n_sign*i] = n_sign*mu[i]+plus;        
	  dir[n_sign*i+1] = n_sign*mu[i]+minus;    
	}
	for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) {
	  for(int i=0; i<N; i++){
	    vin[n_sign*i] = vin[n_sign*i+1] = Prhonu[ns][rs][i];
	    vout[n_sign*i] = P5[plus][ns][rs][i];
	    vout[n_sign*i+1] = P5[minus][ns][rs][i];
	  }
	  parallel_transport.run(n_sign*N, vout, vin, dir);
	}

	// F_mu += P5 Prhonu^dagger
		      
	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++)
	    force_product_sum(P5[plus][ns][rs][w],
			      Prhonu[ns][rs][w],
			      GJP.staple5_coeff(),
			      force[mu[w]]);

	// Psigmarhonu = U_sigma P_rhonu
		
	for(int i=0; i<N; i++){
	  dir[n_sign*i] = (n_sign*sigma[i]);        
	  dir[n_sign*i+1] = (n_sign*sigma[i]+1);    
	}
	for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++){
	  for(int i=0; i<N; i++){
	    vin[n_sign*i] = vin[n_sign*i+1] = Prhonu[ns][rs][i];
	    vout[n_sign*i] = Psigmarhonu[ns][rs][minus][i];
	    vout[n_sign*i+1] = Psigmarhonu[ns][rs][plus][i];
	  }
	  parallel_transport.run(n_sign*N, vout, vin, dir);
	}

	// P7 = U_mu P_sigmarhonu
	for(int i=0; i<N; i++){
	  dir[n_sign*i] = n_sign*mu[i]+plus;        
	  dir[n_sign*i+1] = n_sign*mu[i]+minus;    
	}
	for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) for(ss=0; ss<n_sign; ss++){
	  for(int i=0; i<N; i++){
	    vin[n_sign*i] = vin[n_sign*i+1] = Psigmarhonu[ns][rs][ss][i];
	    vout[n_sign*i] = P7[plus][ns][rs][ss][i];
	    vout[n_sign*i+1] = P7[minus][ns][rs][ss][i];
	  }
	  parallel_transport.run(n_sign*N, vout, vin, dir);
	}

	// F_mu -= P7 Psigmarhonu^\dagger
		
	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) for(ss=0; ss<n_sign; ss++)
	    force_product_sum(P7[plus][ns][rs][ss][w],
			      Psigmarhonu[ns][rs][ss][w],
			      GJP.staple7_coeff(),
			      force[mu[w]]);

	// F_sigma += P7 Psigmarhonu^\dagger
	// N.B. this is the same as one of the previous products.
		
	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) 
	    force_product_sum(P7[plus][ns][rs][minus][w],
			      Psigmarhonu[ns][rs][minus][w],
			      -GJP.staple7_coeff(),
			      force[sigma[w]]);

	// F_sigma += Psigmarhonu P7^\dagger
		
	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) 
	    force_product_sum(Psigmarhonu[ns][rs][minus][w],
			      P7[minus][ns][rs][minus][w],
			      -GJP.staple7_coeff(),
			      force[sigma[w]]);

	// Psigma7 = U_sigma P7 
	for(int i=0; i<N; i++){
	  dir[n_sign*i] = (n_sign*sigma[i]);        
	  dir[n_sign*i+1] = (n_sign*sigma[i]+1);    
	}
	for(ms=0; ms<n_sign; ms++) for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++){
	  for(int i=0; i<N; i++){
	    vin[n_sign*i] = P7[ms][ns][rs][plus][i];
	    vin[n_sign*i+1] = P7[ms][ns][rs][minus][i];
	    vout[n_sign*i] = Psigma7[ms][ns][rs][plus][i];
	    vout[n_sign*i+1] = Psigma7[ms][ns][rs][minus][i];
	  }
	  parallel_transport.run(n_sign*N, vout, vin, dir);
	}

	// F_sigma += Fsigma7 Frhonu^\dagger

	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) 
	    force_product_sum(Psigma7[plus][ns][rs][plus][w],
			      Prhonu[ns][rs][w],
			      -GJP.staple7_coeff(),
			      force[sigma[w]]);

	// F_sigma += Frhonu Fsigma7^\dagger

	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) 
	    force_product_sum(Prhonu[ns][rs][w],
			      Psigma7[minus][ns][rs][plus][w],
			      -GJP.staple7_coeff(),
			      force[sigma[w]]);

	// P5 += c_7/c_5 Psigma7

	if(GJP.staple5_coeff()!=0.0){
	  Float c75 = -GJP.staple7_coeff()/GJP.staple5_coeff();
	  for(ms=0; ms<n_sign; ms++) 
	    for(ns=0; ns<n_sign; ns++) 
	      for(rs=0; rs<n_sign; rs++) 
		for(ss=0; ss<n_sign; ss++) 
		  for(w=0; w<N; w++)
		    vaxpy3(P5[ms][ns][rs][w],&c75, Psigma7[ms][ns][rs][ss][w], P5[ms][ns][rs][w], vax_len);
	  //			P5[ms][ns][rs][w]->FTimesV1PlusV2(-GJP.staple7_coeff()/GJP.staple5_coeff(), Psigma7[ms][ns][rs][ss][w], P5[ms][ns][rs][w], GJP.VolNodeSites()*VECT_LEN);
	  ForceFlops += 2*GJP.VolNodeSites()*VECT_LEN*N*n_sign*n_sign*n_sign*n_sign;
	}

	// F_rho -= P5 Prhonu^\dagger
	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++)
	    force_product_sum(P5[plus][ns][minus][w],
			      Prhonu[ns][minus][w],
			      -GJP.staple5_coeff(),
			      force[rho[w]]);

	// F_rho -= Prhonu P5^\dagger
		    
	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++)
	    force_product_sum(Prhonu[ns][minus][w],
			      P5[minus][ns][minus][w],
			      -GJP.staple5_coeff(),
			      force[rho[w]]);

	// Prho5 = U_rho P5

	for(int i=0; i<N; i++){
	  dir[n_sign*i] = n_sign*rho[i]+plus;        
	  dir[n_sign*i+1] = n_sign*rho[i]+minus;    
	}
	for(ms=0; ms<n_sign; ms++) for(ns=0; ns<n_sign; ns++){
	  for(int i=0; i<N; i++){
	    vin[n_sign*i] = P5[ms][ns][plus][i];
	    vin[n_sign*i+1] = P5[ms][ns][minus][i];
	    vout[n_sign*i] = Prho5[ms][ns][plus][i];
	    vout[n_sign*i+1] = Prho5[ms][ns][minus][i];
	  }
	  parallel_transport.run(n_sign*N, vout, vin, dir);
	}

	// F_rho -= Prho5 Pnu^\dagger
		
	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++)
	    force_product_sum(Prho5[plus][ns][plus][w],
			      Pnu[ns][w],
			      -GJP.staple5_coeff(),
			      force[rho[w]]);

	// F_rho -= Pnu Prho5^\dagger
		
	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++)
	    force_product_sum(Pnu[ns][w],
			      Prho5[minus][ns][plus][w],
			      -GJP.staple5_coeff(),
			      force[rho[w]]);
		
	// P3 += c_5/c_3 Prho5

	if(GJP.staple3_coeff()!=0.0){		
	  Float c53 = -GJP.staple5_coeff()/GJP.staple3_coeff();
	  for(ms=0; ms<n_sign; ms++) 
	    for(ns=0; ns<n_sign; ns++) 
	      for(rs=0; rs<n_sign; rs++) 
		for(w=0; w<N; w++)
		  vaxpy3(P3[ms][ns][w],&c53,Prho5[ms][ns][rs][w], P3[ms][ns][w], vax_len);
	  //			P3[ms][ns][w]->FTimesV1PlusV2(-GJP.staple5_coeff()/GJP.staple3_coeff(), Prho5[ms][ns][rs][w], P3[ms][ns][w], GJP.VolNodeSites()*VECT_LEN);
	  ForceFlops += 2*GJP.VolNodeSites()*VECT_LEN*N*n_sign*n_sign*n_sign;
	}

      } // rho+sigma loop

      // Pnunu = U_nu Pnu

      for(int i=0; i<N; i++){
	dir[n_sign*i] = n_sign*nu[i]+plus;        
	dir[n_sign*i+1] = n_sign*nu[i]+minus;    
      }
      for(int i=0; i<N; i++){
	vin[n_sign*i] = Pnu[minus][i];
	vin[n_sign*i+1] = Pnu[plus][i];
	vout[n_sign*i] = Pnunu[minus][i];
	vout[n_sign*i+1] = Pnunu[plus][i];
      }
      parallel_transport.run(n_sign*N, vout, vin, dir);

      // P5 = U_mu Pnunu

      for(int i=0; i<N; i++){
	dir[n_sign*i] = n_sign*mu[i]+plus;        
	dir[n_sign*i+1] = n_sign*mu[i]+minus;    
      }
      for(ns=0; ns<n_sign; ns++){
	for(int i=0; i<N; i++){
	  vin[n_sign*i] = Pnunu[ns][i];
	  vin[n_sign*i+1] = Pnunu[ns][i];
	  vout[n_sign*i] = P5[plus][ns][0][i];
	  vout[n_sign*i+1] = P5[minus][ns][0][i];
	}
	parallel_transport.run(n_sign*N, vout, vin, dir);
      }

      // F_mu += P5 Pnunu^\dagger

      for(w=0; w<N; w++)
	for(ns=0; ns<n_sign; ns++)
	  force_product_sum(P5[plus][ns][0][w],
			    Pnunu[ns][w],
			    GJP.Lepage_coeff(),
			    force[mu[w]]);

      // F_nu -= P5 Pnunu^\dagger
      // N.B. this is the same as one of the previous products
	    
      for(w=0; w<N; w++)
	force_product_sum(P5[plus][minus][0][w],
			  Pnunu[minus][w],
			  -GJP.Lepage_coeff(),
			  force[nu[w]]);
	    
      // F_nu -= Pnunu P5^\dagger
	    
      for(w=0; w<N; w++)
	force_product_sum(Pnunu[minus][w],
			  P5[minus][minus][0][w],
			  -GJP.Lepage_coeff(),
			  force[nu[w]]);

      // Pnu5 = U_nu P5

      for(int i=0; i<N; i++){
	dir[n_sign*i] = n_sign*nu[i]+plus;        
	dir[n_sign*i+1] = n_sign*nu[i]+minus;    
      }
      for(ms=0; ms<n_sign; ms++){
	for(int i=0; i<N; i++){
	  vin[n_sign*i] =   P5[ms][plus][0][i]; 
	  vin[n_sign*i+1] = P5[ms][minus][0][i];
	  vout[n_sign*i] =   Pnu5[ms][plus][i];
	  vout[n_sign*i+1] = Pnu5[ms][minus][i];
	}
	parallel_transport.run(n_sign*N, vout, vin, dir);
      }

      // F_nu -= Pnu5 Pnu^\dagger

      for(w=0; w<N; w++)
	force_product_sum(Pnu5[plus][plus][w],
			  Pnu[plus][w],
			  -GJP.Lepage_coeff(),
			  force[nu[w]]);

      // F_nu -= Pnu Pnu5^\dagger

      for(w=0; w<N; w++)
	force_product_sum(Pnu[plus][w],
			  Pnu5[minus][plus][w],
			  -GJP.Lepage_coeff(),
			  force[nu[w]]);

      // P3 += c_L/c_3 Pnu5

      if(GJP.staple3_coeff()!=0.0){
	Float cl3 = -GJP.Lepage_coeff()/GJP.staple3_coeff();
	for(ms=0; ms<n_sign; ms++) 
	  for(ns=0; ns<n_sign; ns++) 
	    for(w=0; w<N; w++)
	      vaxpy3(P3[ms][ns][w],&cl3,Pnu5[ms][ns][w],P3[ms][ns][w], vax_len);
	//		   		P3[ms][ns][w]->FTimesV1PlusV2(-GJP.Lepage_coeff()/GJP.staple3_coeff(), Pnu5[ms][ns][w], P3[ms][ns][w], GJP.VolNodeSites()*VECT_LEN);
	ForceFlops += 2*GJP.VolNodeSites()*VECT_LEN*N*n_sign*n_sign;
      }

      // F_nu += P3 Pnu^\dagger

      for(w=0; w<N; w++)
	force_product_sum(P3[plus][minus][w],
			  Pnu[minus][w],
			  -GJP.staple3_coeff(),
			  force[nu[w]]);

      // F_nu +=  Pnu P3^\dagger

      for(w=0; w<N; w++)
	force_product_sum(Pnu[minus][w],
			  P3[minus][minus][w],
			  -GJP.staple3_coeff(),
			  force[nu[w]]);
	    
      // Pnu3 = U_nu P3

      for(int i=0; i<N; i++)
	dir[i] = n_sign*nu[i]+plus;        
      for(ms=0; ms<n_sign; ms++){
	for(int i=0; i<N; i++){
	  vin[i] = P3[ms][plus][i]; 
	  vout[i] = Pnu3[ms][plus][i];
	}
	parallel_transport.run(N, vout, vin, dir);
      }

      // F_nu += Pnu3 X^\dagger

      for(w=0; w<N; w++)
	force_product_sum(Pnu3[plus][plus][w], X,
			  -GJP.staple3_coeff(),
			  force[nu[w]]);

      // F_nu += X Pnu3^\dagger

      for(w=0; w<N; w++)
	force_product_sum(X, Pnu3[minus][plus][w], 
			  -GJP.staple3_coeff(),
			  force[nu[w]]);

      // This stuff is to be done once only for each value of nu[w].
      // Look for  N nu's that haven't been done before.

      bool nextn = false;
      for(w=0; w<N; w++)
	if(done[nu[w]]){
	  nextn = true;
	  break;
	}
      if(nextn) continue;
      for(w=0; w<N; w++) done[nu[w]] = true;

      // Got N new nu's, so do some stuff...
	    
      // F_nu += Pnu X^\dagger

      for(w=0; w<N; w++)
	force_product_sum(Pnu[minus][w], X,
			  GJP.KS_coeff(),
			  force[nu[w]]);

      // F_nu += Pnunu Pnu^\dagger

      for(w=0; w<N; w++)
	force_product_sum(Pnunu[minus][w], Pnu[plus][w],
			  -GJP.Naik_coeff(),
			  force[nu[w]]);

      // F_nu += Pnu Pnunu^\dagger
	    
      for(w=0; w<N; w++)
	force_product_sum(Pnu[minus][w], Pnunu[plus][w],
			  GJP.Naik_coeff(),
			  force[nu[w]]);

      // Pnununu = U_nu Pnunu

      for(int i=0; i<N; i++){
	dir[i] = n_sign*nu[i]+plus;        
	vin[i] = Pnunu[minus][i]; 
	vout[i] = Pnununu[i];
      }
      parallel_transport.run(N, vout, vin, dir);
	    
      // F_nu += Pnununu X^\dagger
		
      for(w=0; w<N; w++)
	force_product_sum(Pnununu[w], X,
			  GJP.Naik_coeff(),
			  force[nu[w]]);
		
	    

    } // nu loop
  } // mu loop


    // Now that we have computed the force, we can update the momenta

  //	nflops +=ParTrans::PTflops + ForceFlops;

#ifdef PROFILE
  dtime += dclock();
  int nflops = ParTrans::PTflops + ForceFlops;
  printf("%s:%s:",cname,fname);
  print_flops(nflops,dtime);
#endif

  Fdt = update_momenta(force, dt, mom);


  // Tidy up
    
#if 0
  sfree(Pnu);
  sfree(P3);
  sfree(Prhonu);
  sfree(P5);
  sfree(P7);
  sfree(Psigma7);
#else
  for(int w = 0;w<N;w++){
    for(int i = 0;i<n_sign;i++){
      v_free(Pnu[i][w]);
      for(int j = 0;j<n_sign;j++){
	v_free(P3[i][j][w]);
	v_free(Prhonu[i][j][w]);
	for(int k = 0;k<n_sign;k++){
	  v_free(P5[i][j][k][w]);
	  for(int l = 0;l<n_sign;l++){
	    v_free(P7[i][j][k][l][w]);
	    v_free(Psigma7[i][j][k][l][w]);
	  }
	}
      }
    }
  }
#endif

  for(int i = 0;i<4;i++) v_free(force[i]);    
  sfree(X);

  Convert(CANONICAL);
#endif   

  return Fdt;
}
コード例 #17
0
ファイル: g_wilson_force.C プロジェクト: DeanHowarth/QUDA-CPS
ForceArg Gwilson::EvolveMomGforce(Matrix *mom, Float dt){
  char *fname = "EvolveMomGforce(M*,F)";
  VRB.Func(cname,fname);
  static Matrix mt0;
  static Matrix *mp0 = &mt0;

  Float L1=0.0;
  Float L2=0.0;
  Float Linf=0.0;

#ifdef PROFILE
  Float time = -dclock();
  ForceFlops=0;
  ParTrans::PTflops=0;
#endif
  static int vol = GJP.VolNodeSites();
  const int N = 4;
  Float tmp = GJP.Beta() *invs3;
  Matrix *Unit = (Matrix *) fmalloc(vol*sizeof(Matrix));
  Matrix *tmp1[N];
  Matrix *tmp2[N];
  Matrix *result[4];
  for(int i = 0;i<4;i++){
  	result[i] = (Matrix *) fmalloc(vol*sizeof(Matrix));
  }
  for(int i = 0;i<N;i++){
  	tmp1[i] = (Matrix *) fmalloc(vol*sizeof(Matrix));
  	tmp2[i] = (Matrix *) fmalloc(vol*sizeof(Matrix));
        bzero((char *)tmp2[i],vol*sizeof(Matrix));
  }
  for(int i = 0;i<vol;i++) 
	Unit[i]=1.;
  Matrix *Units[4];
  for(int i = 0;i<N;i++) Units[i] = Unit;
  int mu,nu;
  {
    int dirs_p[] = {0,2,4,6,0,2,4};
    int dirs_m[] = {1,3,5,7,1,3,5};
    ParTransGauge pt(*this);

  
      for(nu = 1;nu<4;nu++){
        pt.run(N,tmp1,Units,dirs_m+nu);
  	pt.run(N,result,tmp1,dirs_m);
	pt.run(N,tmp1,result,dirs_p+nu);
	for(int i = 0; i<N;i++)
	vaxpy3_m(tmp2[i],&tmp,tmp1[i],tmp2[i],vol*3);
	pt.run(N,tmp1,Units,dirs_p+nu);
	pt.run(N,result,tmp1,dirs_m);
	pt.run(N,tmp1,result,dirs_m+nu);
	for(int i = 0; i<N;i++)
	vaxpy3_m(tmp2[i],&tmp,tmp1[i],tmp2[i],vol*3);
        ForceFlops +=vol*12*N;
      }
      pt.run(N,result,tmp2,dirs_p);
  }

    Matrix mp1;
    for(mu = 0;mu<4;mu++){
      Matrix *mtmp = result[mu];
      for(int i = 0;i<vol;i++) {
        mtmp->TrLessAntiHermMatrix();
        mtmp++;
      }
    }
  ForceFlops += vol*60;
  
  int x[4];
  
  for(x[0] = 0; x[0] < GJP.XnodeSites(); ++x[0]) {
    for(x[1] = 0; x[1] < GJP.YnodeSites(); ++x[1]) {
      for(x[2] = 0; x[2] < GJP.ZnodeSites(); ++x[2]) {
	for(x[3] = 0; x[3] < GJP.TnodeSites(); ++x[3]) {
	  
	  int uoff = GsiteOffset(x);
	  
	  for (int mu = 0; mu < 4; ++mu) {
	    
	    IFloat *ihp = (IFloat *)(mom+uoff+mu);
	    IFloat *dotp2 = (IFloat *) (result[mu]+(uoff/4));
	    fTimesV1PlusV2(ihp, dt, dotp2, ihp, 18);
	    Float norm = ((Matrix*)dotp2)->norm();
	    Float tmp = sqrt(norm);
	    L1 += tmp;
	    L2 += norm;
	    Linf = (tmp>Linf ? tmp : Linf);
	  }
	}
      }
    }
  }
  ForceFlops += vol*144;

#ifdef PROFILE
  time += dclock();
  print_flops(cname,fname,ForceFlops+ParTrans::PTflops,time);
#endif

  ffree(Unit);
  for(int i = 0;i<N;i++){
  ffree(tmp1[i]);
  ffree(tmp2[i]);
  }
  for(int i = 0;i<4;i++) 
  ffree(result[i]);

  glb_sum(&L1);
  glb_sum(&L2);
  glb_max(&Linf);

  L1 /= 4.0*GJP.VolSites();
  L2 /= 4.0*GJP.VolSites();

  return ForceArg(dt*L1, dt*sqrt(L2), dt*Linf);

}
コード例 #18
0
ファイル: wilson_quda.C プロジェクト: DeanHowarth/QUDA-CPS
int DiracOpWilson::QudaInvert(Vector *out, Vector *in, Float *true_res, int mat_type) {

  char *fname = "QudaInvert(V*, V*, F*, int)";

  VRB.ActivateLevel(VERBOSE_FLOW_LEVEL);

  struct timeval start, end;
  gettimeofday(&start,NULL);

  QudaGaugeParam gauge_param = newQudaGaugeParam();
  QudaInvertParam inv_param = newQudaInvertParam();

  int f_size_cb = GJP.VolNodeSites() * lat.FsiteSize() / 2;

  //--------------------------------------
  //  Parameter setting for Gauge Data
  //--------------------------------------

  // set the CUDA precisions
  gauge_param.reconstruct = setReconstruct_wil(QudaParam.reconstruct);
  gauge_param.cuda_prec = setPrecision_wil(QudaParam.gauge_prec);

  // set the CUDA sloppy precisions
  gauge_param.reconstruct_sloppy = setReconstruct_wil(QudaParam.reconstruct_sloppy);
  gauge_param.cuda_prec_sloppy = setPrecision_wil(QudaParam.gauge_prec_sloppy);

  if (sizeof(Float) == sizeof(double)) {
    gauge_param.cpu_prec = QUDA_DOUBLE_PRECISION;
    inv_param.cpu_prec = QUDA_DOUBLE_PRECISION;
  } else {
    gauge_param.cpu_prec = QUDA_SINGLE_PRECISION;
    inv_param.cpu_prec = QUDA_SINGLE_PRECISION;
  }

  gauge_param.X[0] = GJP.XnodeSites();
  gauge_param.X[1] = GJP.YnodeSites();
  gauge_param.X[2] = GJP.ZnodeSites();
  gauge_param.X[3] = GJP.TnodeSites();
  gauge_param.anisotropy = GJP.XiBare();
  gauge_param.cuda_prec_precondition = QUDA_DOUBLE_PRECISION;
  gauge_param.reconstruct_precondition = setReconstruct_wil(QudaParam.reconstruct_sloppy);

  if (GJP.XiDir() != 3) ERR.General(cname, fname, "Anisotropy direction not supported\n");
  
  //---------------------------------------------------
  // QUDA_FLOAT_GAUGE_ORDER = 1
  // QUDA_FLOAT2_GAUGE_ORDER = 2, // no reconstruct and double precision
  // QUDA_FLOAT4_GAUGE_ORDER = 4, // 8 and 12 reconstruct half and single
  // QUDA_QDP_GAUGE_ORDER, // expect *gauge[4], even-odd, row-column color
  // QUDA_CPS_WILSON_GAUGE_ORDER, // expect *gauge, even-odd, mu inside, column-row color
  // QUDA_MILC_GAUGE_ORDER, // expect *gauge, even-odd, mu inside, row-column order
  //
  // MULTI GPU case, we have to use QDP format of gauge data
  //
  //---------------------------------------------------
  
  gauge_param.gauge_order = QUDA_CPS_WILSON_GAUGE_ORDER;

  //---------------------------------------------------
  
  gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
  gauge_param.type = QUDA_WILSON_LINKS;

  for (int d=0; d<3; d++) if (GJP.Bc(d) != BND_CND_PRD) 
    ERR.General(cname, fname, "Boundary condition not supported\n");
  if (GJP.Tbc() == BND_CND_PRD)
    gauge_param.t_boundary = QUDA_PERIODIC_T;
  else
    gauge_param.t_boundary = QUDA_ANTI_PERIODIC_T;
  
  //------------------------------------------
  //  Parameter setting for Matrix invertion
  //------------------------------------------
  inv_param.cuda_prec = setPrecision_wil(QudaParam.spinor_prec);
  inv_param.cuda_prec_sloppy = setPrecision_wil(QudaParam.spinor_prec_sloppy);

  inv_param.maxiter = dirac_arg->max_num_iter;
  inv_param.reliable_delta = QudaParam.reliable_delta;
  
  inv_param.Ls = 1;
  //inv_param.Ls = GJP.SnodeSites();
  
  //--------------------------
  // Possible dslash type
  //--------------------------
  // QUDA_WILSON_DSLASH
  // QUDA_CLOVER_WILSON_DSLASH
  // QUDA_DOMAIN_WALL_DSLASH
  // QUDA_ASQTAD_DSLASH
  // QUDA_TWISTED_MASS_DSLASH
  //--------------------------
  inv_param.dslash_type = QUDA_WILSON_DSLASH;

  //--------------------------------
  // Possible normalization method
  //--------------------------------
  // QUDA_KAPPA_NORMALIZATION
  // QUDA_MASS_NORMALIZATION
  // QUDA_ASYMMETRIC_MASS_NORMALIZATION
  //--------------------------------
  inv_param.kappa = kappa;
  inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
  //inv_param.mass = dirac_arg->mass;
  //inv_param.mass_normalization = QUDA_MASS_NORMALIZATION;
  
  inv_param.dagger = QUDA_DAG_NO;

  switch (mat_type) {
  case 0:
    inv_param.solution_type = QUDA_MATPC_SOLUTION;
    break;
  case 1:
    inv_param.solution_type = QUDA_MATPCDAG_MATPC_SOLUTION;
    break;
  default:
    ERR.General(cname, fname, "Matrix solution type not defined\n");
  }

  inv_param.matpc_type = QUDA_MATPC_ODD_ODD;
  //inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
  inv_param.preserve_source = QUDA_PRESERVE_SOURCE_NO;
  inv_param.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS;
  //inv_param.gamma_basis = QUDA_UKQCD_GAMMA_BASIS;
  
  inv_param.dirac_order = QUDA_CPS_WILSON_DIRAC_ORDER;
  //inv_param.dirac_order = QUDA_DIRAC_ORDER;

  inv_param.input_location = QUDA_CPU_FIELD_LOCATION;
  inv_param.output_location = QUDA_CPU_FIELD_LOCATION;
  inv_param.tune = QUDA_TUNE_NO;
  inv_param.use_init_guess = QUDA_USE_INIT_GUESS_YES;

  //--------------------------
  // Possible verbose type
  //--------------------------
  // QUDA_SILENT
  // QUDA_SUMMARIZE
  // QUDA_VERBOSE
  // QUDA_DEBUG_VERBOSE
  //--------------------------
  inv_param.verbosity = QUDA_VERBOSE;

  switch (dirac_arg->Inverter) {
  case CG:
    inv_param.inv_type = QUDA_CG_INVERTER;
    inv_param.solve_type = QUDA_NORMEQ_PC_SOLVE;
    break;
  case BICGSTAB:
    inv_param.inv_type = QUDA_BICGSTAB_INVERTER;
    inv_param.solve_type = QUDA_DIRECT_PC_SOLVE;
    break;
  default:
    inv_param.inv_type = QUDA_CG_INVERTER;
    inv_param.solve_type = QUDA_NORMEQ_PC_SOLVE;
    break;
  }
  
  // domain decomposition preconditioner parameters
  inv_param.inv_type_precondition = QUDA_INVALID_INVERTER;
  inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ;
  inv_param.precondition_cycle = 1;
  inv_param.tol_precondition = 1e-1;
  inv_param.maxiter_precondition = 10;
  inv_param.verbosity_precondition = QUDA_VERBOSE;
  inv_param.prec_precondition = QUDA_HALF_PRECISION;
  inv_param.omega = 1.0;

  
  gauge_param.ga_pad = 0; // 24*24*24/2;
  inv_param.sp_pad = 0; // 24*24*24/2;
  inv_param.cl_pad = 0; // 24*24*24/2;

#ifdef USE_QMP
  //------------------------------------------
  // This part is needed to make buffer memory
  // space for multi GPU Comm.
  //------------------------------------------
  int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
  int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
  int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
  int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
  int pad_size =MAX(x_face_size, y_face_size);
  pad_size = MAX(pad_size, z_face_size);
  pad_size = MAX(pad_size, t_face_size);
  gauge_param.ga_pad = pad_size;
#endif

  loadGaugeQuda((void*)gauge_field, &gauge_param);

  Vector *x = (Vector*)smalloc(f_size_cb * sizeof(Float));
  Vector *r = (Vector*)smalloc(f_size_cb * sizeof(Float));
  x->VecZero(f_size_cb); 
  r->VecZero(f_size_cb); 
  
  //----------------------------------------------
  //  Calculate Flops value 
  //----------------------------------------------
  Float flops = 0.0;
  Float matvec_flops = (2*1320+48)*GJP.VolNodeSites()/2;
  if (mat_type == 1) matvec_flops *= 2; // double flops since normal equations
  //----------------------------------------------

  //----------------------------------------------
  // Calculate Stop condition
  //----------------------------------------------
  Float in_norm2 = in->NormSqGlbSum(f_size_cb);
  Float stop = dirac_arg->stop_rsd * dirac_arg->stop_rsd * in_norm2;
  
  int total_iter = 0, k = 0;
  
  // Initial residual
  if (mat_type == 0)
  {
    MatPc(r,out);
  }
  else
  {
    MatPcDagMatPc(r,out);
  }

  r->FTimesV1MinusV2(1.0,in,r,f_size_cb);
  Float r2 = r->NormSqGlbSum(f_size_cb);
  flops += 4*f_size_cb + matvec_flops;

  VRB.Flow(cname, fname, "0 iterations, res^2 = %1.15e, restart = 0\n", r2);

  while (r2 > stop && k < QudaParam.max_restart) {
    inv_param.tol = dirac_arg->stop_rsd;
    if(sqrt(stop/r2)>inv_param.tol) 
    {
      inv_param.tol = sqrt(stop/r2);
    }

    x->VecZero(f_size_cb);
    //---------------------------------
    //  Inversion sequence start
    //---------------------------------
    invertQuda(x, r, &inv_param);
    
    // Update solution
    out->VecAddEquVec(x, f_size_cb);
    
    //------------------------------------
    // Calculate new residual
    if (mat_type == 0)
      MatPc(r, out);
    else
      MatPcDagMatPc(r, out);

    r->FTimesV1MinusV2(1.0,in,r,f_size_cb);
    r2 = r->NormSqGlbSum(f_size_cb);
    //------------------------------------

    k++;
    total_iter += inv_param.iter + 1;
    flops += 1e9*inv_param.gflops + 8*f_size_cb + matvec_flops;

    VRB.Flow(cname, fname, "Gflops = %e, Seconds = %e, Gflops/s = %f\n", 
             inv_param.gflops, inv_param.secs, inv_param.gflops / inv_param.secs);

    VRB.Flow(cname, fname, "True |res| / |src| = %1.15e, iter = %d, restart = %d\n",  
             sqrt(r2)/sqrt(in_norm2), total_iter, k);
  }
  
  gettimeofday(&end,NULL);
  print_flops(cname,fname,flops,&start,&end);
    
  VRB.Flow(cname, fname, "Cuda Space Required. Spinor:%f + Gauge:%f GiB\n", 
	   inv_param.spinorGiB, gauge_param.gaugeGiB);

  VRB.Flow(cname, fname, "True |res| / |src| = %1.15e, iter = %d, restart = %d\n", 
	     sqrt(r2)/sqrt(in_norm2), total_iter, k);

  if (true_res) *true_res = sqrt(r2);

  //----------------------------------------
  //  Finalize QUDA memory and API
  //----------------------------------------
  freeGaugeQuda();
  //----------------------------------------

  sfree(x);
  sfree(r);

  //VRB.DeactivateLevel(VERBOSE_FLOW_LEVEL);
  return total_iter;
}
コード例 #19
0
ファイル: pt_vec.C プロジェクト: DeanHowarth/QUDA-CPS
void PT::vec_cb_norm(int n, IFloat **vout, IFloat **vin, const int *dir,int parity, IFloat * gauge)
{
  //List of the different directions
  int wire[n];
  int i;
//  int j,d,s,k;
  //SCUDirArgs for sending and receiving in the n directions
  SCUDirArgIR *SCUarg_p[2*non_local_dirs];
  //SCUDirArgIR *SCUarg_p[2*n];
  SCUDirArgMulti SCUmulti;
  static int call_num = 0;
  int vlen = VECT_LEN;
  int vlen2 = VECT_LEN;
//  printf("gauge=%p\n",gauge);

  call_num++;
  
  //Name our function
//  char *fname="pt_1vec_cb_norm()";
  
  //Set the transfer directions
  //If wire[i] is even, then we have communication in the negative direction
  //If wire[i] is odd, then we have communication in the positive direction
  for(i=0;i<n;i++)
    wire[i]=dir[i];

  Float dtime;


  //If wire[i] is odd, then we have parallel transport in the
  //positive direction.  In this case, the matrix multiplication is
  //done before the field is transferred over to the adjacent node
  //
  //If we have transfer in the negative T direction (wire[i] = 6) then
  //we have to copy the appropriate fields into the send buffer
  if(conjugated)
    for(i=0;i<n;i++)
      {
	if(!local[wire[i]/2])
	  {
	    if(wire[i]%2)
	      {
		//printf("dir = %d, pre-mulitply\n", wire[i]);
#ifdef PROFILE
  dtime  = - dclock();
#endif

  partrans_cmv(non_local_chi_cb[wire[i]]/2,uc_nl_cb_pre[parity][wire[i]/2],gauge,vin[i],snd_buf_cb[wire[i]/2]);

#ifdef PROFILE
  dtime +=dclock();
  print_flops("",fname,66*non_local_chi_cb[wire[i]],dtime);
#endif
	      }
	    else if((wire[i] == 6))
	      {
#ifdef PROFILE
  dtime  = - dclock();
#endif
#if 1
            pt_copy_buffer(non_local_chi_cb[6],(long)vin[i],(long)snd_buf_t_cb,(long)Toffset[parity]);
#else
		for(j = 0; j < non_local_chi_cb[6];j++)
		  for(k = 0; k < VECT_LEN;k++)
		    *(snd_buf_t_cb+j*VECT_LEN+k) = *(vin[i] + *(Toffset[parity]+j)+ k);
		  //moveMem(snd_buf_t_cb + j*VECT_LEN,vin[i] + *(Toffset[parity]+j)*vlen,VECT_LEN*sizeof(IFloat));
#endif
#ifdef PROFILE
  dtime +=dclock();
  print_flops(fname,"pt_copy_buffer()",0,dtime);
#endif
	      }
	  }
      }
  else
    for(i=0;i<n;i++)
      {
	if(!local[wire[i]/2])
	  {
	    if(wire[i]%2)
	      {
#ifdef PROFILE
  dtime  = - dclock();
#endif
 
  partrans_cmv_dag(non_local_chi_cb[wire[i]]/2,uc_nl_cb_pre[parity][wire[i]/2],gauge,vin[i],snd_buf_cb[wire[i]/2]);	 

#ifdef PROFILE
  dtime +=dclock();
  print_flops("",fname,66*non_local_chi_cb[wire[i]],dtime);
#endif
	      }
	    else if(wire[i] == 6)
	      {
#ifdef PROFILE
  dtime  = - dclock();
#endif
#if 1
            pt_copy_buffer(non_local_chi_cb[6],(long)vin[i],(long)snd_buf_t_cb,(long)Toffset[parity]);
#else
		for(j = 0; j < non_local_chi_cb[6];j++)
		  for(k = 0; k < VECT_LEN;k++)
		    *(snd_buf_t_cb+j*VECT_LEN+k) = *(vin[i] + *(Toffset[parity]+j)+ k);
//		  moveMem(snd_buf_t_cb + j*VECT_LEN,vin[i] + *(Toffset[parity]+j),VECT_LEN*sizeof(IFloat));
#endif
#ifdef PROFILE
  dtime +=dclock();
  print_flops(fname,"pt_copy_buffer()",0,dtime);
#endif
	      }
	  }
      }
#ifdef PROFILE
  dtime  = - dclock();
#endif

  #if 1
  int comms = 0;
  for(i=0;i<n;i++)
    {
      if(!local[wire[i]/2])
	{
	  //Calculate the starting address for the data to be sent
	  IFloat *addr = vin[i] + VECT_LEN * offset_cb[wire[i]];
	  //This points to the appropriate SCUDirArg for receiving
	  SCUarg_p[2*comms] = SCUarg_cb[2*wire[i]];
	  //This points to the appropriate SCUDirArg for sending
	  SCUarg_p[2*comms+1] = SCUarg_cb[2*wire[i]+1];
	  
	  //Set the send address
	  if(wire[i]%2)
	    SCUarg_p[2*comms+1]->Addr((void *)snd_buf_cb[wire[i]/2]);
	  else if(wire[i] == 6)
	    SCUarg_p[2*comms+1]->Addr((void *)snd_buf_t_cb);
	  else
	    SCUarg_p[2*comms+1]->Addr((void *)addr);
	  comms++;
	}
    }
  #endif


  if(comms){
    SCUmulti.Init(SCUarg_p,2*comms);
//Begin transmission
    SCUmulti.SlowStartTrans();
  }

  //Do local calculations
  if(conjugated)
    {
      for(i=0;i<n;i++)
	{
#ifdef PROFILE
  dtime  = - dclock();
#endif 
	
  if(wire[i]%2)
    partrans_cmv(local_chi_cb[wire[i]]/2,uc_l_cb[parity][wire[i]],gauge,vin[i],vout[i]);
  else
    partrans_cmv_dag(local_chi_cb[wire[i]]/2,uc_l_cb[parity][wire[i]],gauge,vin[i],vout[i]);
  
#ifdef PROFILE
  dtime +=dclock();
  print_flops("",fname,66*local_chi_cb[wire[i]],dtime);
#endif
	}
    }
  else
    {
    for(i=0;i<n;i++)
      {
#ifdef PROFILE
  dtime  = - dclock();
#endif

  if(!(wire[i]%2))
    partrans_cmv(local_chi_cb[wire[i]]/2,uc_l_cb[parity][wire[i]],gauge,vin[i],vout[i]);
  else
    partrans_cmv_dag(local_chi_cb[wire[i]]/2,uc_l_cb[parity][wire[i]],gauge,vin[i],vout[i]);

#ifdef PROFILE
  dtime +=dclock();
  print_flops("",fname,66*local_chi_cb[wire[i]],dtime);
#endif
      }
    }
  
  //End transmission
  if(comms){ SCUmulti.TransComplete(); }

  //If wire[i] is even, then we have transport in the negative direction.
  //In this case, the vector field is multiplied by the SU(3) link matrix
  //after all communication is complete
  IFloat *fp0, *fp1;
  for(i=0;i<n;i++)
    {
      if(!local[wire[i]/2])
      	{
	  if(!(wire[i]%2))
	    {
#ifdef PROFILE
  dtime  = - dclock();
#endif

	    if(conjugated)
	      partrans_cmv_dag(non_local_chi_cb[wire[i]]/2,uc_nl_cb[parity][wire[i]],gauge,rcv_buf[wire[i]],vout[i]);
	    else
	      partrans_cmv(non_local_chi_cb[wire[i]]/2,uc_nl_cb[parity][wire[i]],gauge,rcv_buf[wire[i]],vout[i]);

#ifdef PROFILE
  dtime +=dclock();
  print_flops("",fname,66*non_local_chi_cb[wire[i]],dtime);
#endif
	    }
	  //Otherwise we have parallel transport in the positive direction.
	  //In this case, the received data has already been pre-multiplied
	  //All we need to do is to put the transported field in the correct place
	  else
	    {
#ifdef PROFILE
  dtime  = - dclock();
#endif
#if 1
              pt_copy(non_local_chi_cb[wire[i]]/2,uc_nl_cb[parity][wire[i]],rcv_buf[wire[i]],vout[i]);
#else
	      //Place the data in the receive buffer into the result vector
	      for(s=0;s<non_local_chi_cb[wire[i]];s++)
		{
		  fp0 = (IFloat *)((long)rcv_buf[wire[i]]+uc_nl_cb[parity][wire[i]][s].src);
		  fp1 = (IFloat *)((long)vout[i]+uc_nl_cb[parity][wire[i]][s].dest);
		  for(d = 0;d<VECT_LEN;d++)
		    *(fp1+d) = *(fp0+d);
		  //moveMem(fp1,fp0,VECT_LEN*sizeof(IFloat));
		}
#endif
#ifdef PROFILE
  dtime +=dclock();
  print_flops(fname,"pt_copy()",0,dtime);
#endif
	    }
	}
    }
//  ParTrans::PTflops +=33*n*vol;
}
コード例 #20
0
// CJ: change start
//------------------------------------------------------------------
// EvolveMomFforce(Matrix *mom, Vector *chi, Float mass, 
//                 Float dt):
// It evolves the canonical momentum mom by dt
// using the fermion force.
//------------------------------------------------------------------
ForceArg FdwfBase::EvolveMomFforce(Matrix *mom, Vector *chi, 
			   Float mass, Float dt){
  char *fname = "EvolveMomFforce(M*,V*,F,F,F)";
  VRB.Func(cname,fname);
  Matrix *gauge = GaugeField() ;

  if (Colors() != 3)
    ERR.General(cname,fname,"Wrong nbr of colors.") ;
 
  if (SpinComponents() != 4)
    ERR.General(cname,fname,"Wrong nbr of spin comp.") ;
 
  if (mom == 0)
    ERR.Pointer(cname,fname,"mom") ;
 
  if (chi == 0)
    ERR.Pointer(cname,fname,"chi") ;
 
  //----------------------------------------------------------------
  // allocate space for two CANONICAL fermion fields
  //----------------------------------------------------------------

  int f_size = FsiteSize() * GJP.VolNodeSites() ;
  int f_site_size_4d = 2 * Colors() * SpinComponents();
  int f_size_4d = f_site_size_4d * GJP.VolNodeSites() ;
 
  char *str_v1 = "v1" ;
  Vector *v1 = (Vector *)fmalloc(cname,fname,str_v1,f_size*sizeof(Float));
//  if (v1 == 0) ERR.Pointer(cname, fname, str_v1) ;
//  VRB.Smalloc(cname, fname, str_v1, v1, f_size*sizeof(Float)) ;

  char *str_v2 = "v2" ;
  Vector *v2 = (Vector *)fmalloc(cname,fname,str_v2,f_size*sizeof(Float)) ;
//  if (v2 == 0) ERR.Pointer(cname, fname, str_v2) ;
//  VRB.Smalloc(cname, fname, str_v2, v2, f_size*sizeof(Float)) ;

//  LatMatrix MomDiff(QFAST,4);
//  Matrix *mom_diff = MomDiff.Mat();

  Float L1 = 0.0;
  Float L2 = 0.0;
  Float Linf = 0.0;

  //----------------------------------------------------------------
  // Calculate v1, v2. Both v1, v2 must be in CANONICAL order after
  // the calculation.
  //----------------------------------------------------------------  

  VRB.Clock(cname, fname, "Before calc force vecs.\n") ;

#ifdef PROFILE
  Float time = -dclock();
  ForceFlops=0;
#endif
  {
    CgArg cg_arg ;
    cg_arg.mass = mass ;

    DiracOpDwf dwf(*this, v1, v2, &cg_arg, CNV_FRM_NO) ;
    dwf.CalcHmdForceVecs(chi) ;
    Fconvert(v1,CANONICAL,WILSON);
    Fconvert(v2,CANONICAL,WILSON);
  }
#ifdef PROFILE
  time += dclock();
  print_flops(cname,fname,ForceFlops,time);
#endif

  int mu, x, y, z, t, s, lx, ly, lz, lt, ls ;
  int size[5],surf[4],blklen[4],stride[4],numblk[4];
 
  size[0] = lx = GJP.XnodeSites() ;
  size[1] = ly = GJP.YnodeSites() ;
  size[2] = lz = GJP.ZnodeSites() ;
  size[3] = lt = GJP.TnodeSites() ;
  size[4] = ls = GJP.SnodeSites() ;

  blklen[0] = sizeof(Float)*FsiteSize()/size[4];
  numblk[0] = GJP.VolNodeSites()/size[0]*size[4];
  stride[0] = blklen[0] * (size[0]-1);
  for (int i =1;i<4;i++){
    blklen[i] = blklen[i-1] * size[i-1];
	numblk[i] = numblk[i-1] / size[i];
    stride[i] = blklen[i] * (size[i]-1);
  }
  for (int i =0;i<4;i++){
    surf[i] = GJP.VolNodeSites()/size[i];
  }

  //----------------------------------------------------------------
  // allocate buffer space for two fermion fields that are assoc
  // with only one 4-D site.
  //----------------------------------------------------------------

  unsigned long flag = QFAST;
  if (ls<4) flag|= QNONCACHE;
  char *str_site_v1 = "site_v1" ;
  char *str_site_v2 = "site_v2" ;

  Float *v1_buf[4],*v2_buf[4];
  int pos[4];
  Float *v1_buf_pos[4];
  Float *v2_buf_pos[4];
  for(int i =0;i<4;i++) {
    v1_buf[i]=(Float *)fmalloc(cname,fname,"v1_buf",surf[i]*FsiteSize()*sizeof(Float)) ;
    v2_buf[i]=(Float *)fmalloc(cname,fname,"v2_buf",surf[i]*FsiteSize()*sizeof(Float)) ;
  }

//  Matrix tmp_mat1, tmp_mat2 ;
  Matrix *tmp_mat1,*tmp_mat2;
  tmp_mat1 = (Matrix *)fmalloc(cname,fname,"tmp_mat1",sizeof(Matrix)*2);
  tmp_mat2 = tmp_mat1+1;

  SCUDirArgIR Send[4];
  SCUDirArgIR Recv[4];
  SCUDMAInst *dma[2];
  for(int i = 0;i<2;i++) dma[i] = new SCUDMAInst;
  void *addr[2];
  int f_bytes = sizeof(Float)*f_site_size_4d;
  int st_bytes = sizeof(Float)*f_size_4d - f_bytes;
  for (mu=0; mu<4; mu++){
    if ( GJP.Nodes(mu) >1){
      dma[0]->Init(v1_buf[mu],surf[mu]*ls*f_bytes);
      dma[1]->Init(v2_buf[mu],surf[mu]*ls*f_bytes);
      Recv[mu].Init(gjp_scu_dir[2*mu],SCU_REC,dma,2);
      dma[0]->Init(v1,blklen[mu],numblk[mu],stride[mu]);
      dma[1]->Init(v2,blklen[mu],numblk[mu],stride[mu]);
      Send[mu].Init(gjp_scu_dir[2*mu+1],SCU_SEND,dma,2);
    }
  }
  for(int i = 0;i<2;i++) delete dma[i];

  VRB.Clock(cname, fname, "Before loop over links.\n") ;

#ifdef PROFILE
  time = -dclock();
  ForceFlops=0;
#endif

  sys_cacheflush(0);
  for (mu=0; mu<4; mu++){
    if ( GJP.Nodes(mu) >1){
      Recv[mu].StartTrans();
      Send[mu].StartTrans();
    }
  }
  for (mu=0; mu<4; mu++){
    for (pos[3]=0; pos[3]<size[3]; pos[3]++){
    for (pos[2]=0; pos[2]<size[2]; pos[2]++){
    for (pos[1]=0; pos[1]<size[1]; pos[1]++){
    for (pos[0]=0; pos[0]<size[0]; pos[0]++){
      int gauge_offset = offset(size,pos);
      int vec_offset = f_site_size_4d*gauge_offset ;
      gauge_offset = mu+4*gauge_offset ;

//      printf("%d %d %d %d %d\n",mu,pos[0],pos[1],pos[2],pos[3]);
      Float *v1_plus_mu ;
      Float *v2_plus_mu ;
      int vec_plus_mu_stride ;
      int vec_plus_mu_offset = f_site_size_4d ;

      Float coeff = -2.0 * dt ;

          vec_plus_mu_offset *= offset(size,pos,mu);
      if ((GJP.Nodes(mu)>1)&&((pos[mu]+1) == size[mu]) ) {
      } else {
            v1_plus_mu = (Float *)v1+vec_plus_mu_offset ;
            v2_plus_mu = (Float *)v2+vec_plus_mu_offset ;
            vec_plus_mu_stride = f_size_4d - f_site_size_4d ;
            if ((pos[mu]+1) == size[mu])
            if (GJP.NodeBc(mu)==BND_CND_APRD) coeff = -coeff ;

//     Float time2 = -dclock();
      sproj_tr[mu]( (IFloat *)tmp_mat1,
                    (IFloat *)v1_plus_mu,
                    (IFloat *)v2+vec_offset,
                    ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ;

      sproj_tr[mu+4]( (IFloat *)tmp_mat2,
                      (IFloat *)v2_plus_mu,
                      (IFloat *)v1+vec_offset,
                      ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ;
 //    time2 += dclock();
 //    print_flops("","sproj",2*9*16*ls,time2);

      *tmp_mat1 += *tmp_mat2 ;

      // If GJP.Snodes > 1 sum up contributions from all s nodes
      if(GJP.Snodes() > 1) {
	  glb_sum_multi_dir((Float *)tmp_mat1, 4, sizeof(Matrix)/sizeof(IFloat) ) ;
      }

      tmp_mat2->DotMEqual(*(gauge+gauge_offset), *tmp_mat1) ;

      tmp_mat1->Dagger(*tmp_mat2) ;

      tmp_mat2->TrLessAntiHermMatrix(*tmp_mat1) ;

      *tmp_mat2 *= coeff ;

      *(mom+gauge_offset) += *tmp_mat2 ;

      Float norm = tmp_mat2->norm();
      Float tmp = sqrt(norm);
      L1 += tmp;
      L2 += norm;
      Linf = (tmp>Linf ? tmp : Linf);
    }

    } } } } // end for x,y,z,t
  } // end for mu
  for (mu=0; mu<4; mu++){
    if ( GJP.Nodes(mu) >1){
      Recv[mu].TransComplete();
      Send[mu].TransComplete();
    }
  }
 
//------------------------------------------------------------------
// start by summing first over direction (mu) and then over site
// to allow SCU transfers to happen face-by-face in the outermost
// loop.
//------------------------------------------------------------------

  for(int i = 0;i<4;i++){
    v1_buf_pos[i] = v1_buf[i];
    v2_buf_pos[i] = v2_buf[i];
  }

  for (mu=0; mu<4; mu++){
    for (pos[3]=0; pos[3]<size[3]; pos[3]++){
    for (pos[2]=0; pos[2]<size[2]; pos[2]++){
    for (pos[1]=0; pos[1]<size[1]; pos[1]++){
    for (pos[0]=0; pos[0]<size[0]; pos[0]++){
      int gauge_offset = offset(size,pos);
      int vec_offset = f_site_size_4d*gauge_offset ;
      gauge_offset = mu+4*gauge_offset ;

      Float *v1_plus_mu ;
      Float *v2_plus_mu ;
      int vec_plus_mu_stride ;
      int vec_plus_mu_offset = f_site_size_4d ;

      Float coeff = -2.0 * dt ;

//      printf("%d %d %d %d %d\n",mu,pos[0],pos[1],pos[2],pos[3]);
          vec_plus_mu_offset *= offset(size,pos,mu);
      if ((GJP.Nodes(mu)>1)&&((pos[mu]+1) == size[mu]) ) {
            v1_plus_mu = v1_buf_pos[mu] ;
            v2_plus_mu = v2_buf_pos[mu] ;
 			v1_buf_pos[mu] += f_site_size_4d;
 			v2_buf_pos[mu] += f_site_size_4d;
            vec_plus_mu_stride = (surf[mu] -1)*f_site_size_4d ;
            if (GJP.NodeBc(mu)==BND_CND_APRD) coeff = -coeff ;

//     Float time2 = -dclock();
      sproj_tr[mu]( (IFloat *)tmp_mat1,
                    (IFloat *)v1_plus_mu,
                    (IFloat *)v2+vec_offset,
                    ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ;

      sproj_tr[mu+4]( (IFloat *)tmp_mat2,
                      (IFloat *)v2_plus_mu,
                      (IFloat *)v1+vec_offset,
                      ls, vec_plus_mu_stride, f_size_4d-f_site_size_4d) ;
 //    time2 += dclock();
 //    print_flops("","sproj",2*9*16*ls,time2);

      *tmp_mat1 += *tmp_mat2 ;

      // If GJP.Snodes > 1 sum up contributions from all s nodes
      if(GJP.Snodes() >1 ) {
	  glb_sum_multi_dir((Float *)tmp_mat1, 4, sizeof(Matrix)/sizeof(IFloat) ) ;
      }

      tmp_mat2->DotMEqual(*(gauge+gauge_offset), *tmp_mat1) ;

      tmp_mat1->Dagger(*tmp_mat2) ;

      tmp_mat2->TrLessAntiHermMatrix(*tmp_mat1) ;

      *tmp_mat2 *= coeff ;

      *(mom+gauge_offset) += *tmp_mat2 ;
      Float norm = tmp_mat2->norm();
      Float tmp = sqrt(norm);
      L1 += tmp;
      L2 += norm;
      Linf = (tmp>Linf ? tmp : Linf);
    }

    } } } } // end for x,y,z,t
  } // end for mu

  ForceFlops += (2*9*16*ls + 18+ 198+36+24)*lx*ly*lz*lt*4;
#ifdef PROFILE
  time += dclock();
  print_flops(cname,fname,ForceFlops,time);
#endif
 
//------------------------------------------------------------------
// deallocate smalloc'd space
//------------------------------------------------------------------

  for(int i =0;i<4;i++) {
    ffree(v1_buf[i],cname,fname,"v1_buf");
    ffree(v2_buf[i],cname,fname,"v2_buf");
  }
 
  VRB.Sfree(cname, fname, str_v2, v2) ;
  ffree(v2) ;
 
  VRB.Sfree(cname, fname, str_v1, v1) ;
  ffree(v1) ;

  ffree(tmp_mat1);
 
  glb_sum(&L1);
  glb_sum(&L2);
  glb_max(&Linf);

  L1 /= 4.0*GJP.VolSites();
  L2 /= 4.0*GJP.VolSites();

  return ForceArg(L1, sqrt(L2), Linf);
}