C++ (Cpp) moveMem Examples

Programming Language: C++ (Cpp)

Method/Function: moveMem

Examples at hotexamples.com: 6

C++ (Cpp) moveMem - 6 examples found. These are the top rated real world C++ (Cpp) examples of moveMem extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: alg_tcharge.C Project: DeanHowarth/QUDA-CPS

void CloverLeaf2x2(Lattice& lattice, Matrix& pl,  int* pos, int mu, int nu)
{
  Matrix P0,P1,P2,P3;
  // 1x2 size
  P0.ZeroMatrix();
  P1.ZeroMatrix();
  P2.ZeroMatrix();
  P3.ZeroMatrix();
  
  // each direction could be {0,1,2,3,4,5,6,7} coresponding to
  // the directions {n_x, n_y, n_z, n_t, -n_x, -n_y, -n_z, -n_t}
  int dirs0[8]={mu,mu, nu, nu, mu+4,mu+4, nu+4, nu+4};
  lattice.PathOrdProdPlus(P0, pos, dirs0, 8);
    
  int dirs1[8]={nu+4, nu+4, mu+4, mu+4, nu, nu, mu,mu };
  lattice.PathOrdProdPlus(P1, pos, dirs1, 8);
  
  int dirs2[8]={nu, nu, mu+4, mu+4, nu+4, nu+4, mu, mu };
  lattice.PathOrdProdPlus(P2, pos, dirs2, 8);
  
  int dirs3[8]={mu,mu, nu+4, nu+4, mu+4, mu+4, nu, nu };
  lattice.PathOrdProdPlus(P3, pos, dirs3, 8);
  
  P0 -=  P1;
  P0 +=  P2;
  P0 -=  P3;
  P0 *= 1.0/16;
   
  moveMem((Float*) &pl,(Float*) &P0, 18 * sizeof(Float) );
  
}

Example #2

Show file

File: alg_nuc3pt.C Project: DeanHowarth/QUDA-CPS

// multiply exp( - i Q ) in `mu' direction. 
void twist_links(Lattice &lat, const Float Q, const int mu)
{
  // multiply su3 links by u1 links
  // assumes both are in canonical order!
  // link =  exp(-i A_mu Q) where Q is the 
  // quark electric charge or twist angle
  int x[4];
  Complex cc(cos(Q), -sin(Q)); 
  
  Matrix temp;
  for(x[3]=0; x[3]<GJP.TnodeSites();x[3]++){
    for(x[2]=0; x[2]<GJP.ZnodeSites();x[2]++){
      for(x[1]=0; x[1]<GJP.YnodeSites();x[1]++){
	for(x[0]=0; x[0]<GJP.XnodeSites();x[0]++){
	  int offset_x = lat.GsiteOffset(x);
	  // the link matrix at current site, in direction mu:
	  Matrix *su3_link = lat.GaugeField() + offset_x + mu;
	  //cTimesVec((float*)&temp, *(float*)phase, *((float*)phase+1), 
	  cTimesVec((IFloat*)&temp, (IFloat)cc.real(), (IFloat)cc.imag(), 
		    (IFloat*)su3_link, 18);
	  moveMem((IFloat*)su3_link, (IFloat*)&temp, 18);
	}
      }
    }
  }
}

Example #3

Show file

File: d_op_wilson.C Project: DeanHowarth/QUDA-CPS

//------------------------------------------------------------------
// int MatInv(Vector *out, Vector *in, 
//            Float *true_res, PreserveType prs_in);
// The inverse of the unconditioned Dirac Operator 
// using Conjugate gradient.
// If true_res !=0 the value of the true residual is returned
// in true_res.
// *true_res = |src - MatPcDagMatPc * sol| / |src|
// prs_in is used to specify if the source
// in should be preserved or not. If not the memory usage
// is less by half the size of a fermion vector.
// The function returns the total number of CG iterations.
//------------------------------------------------------------------
int DiracOpWilson::MatInv(Vector *out, 
			  Vector *in, 
			  Float *true_res,
			  PreserveType prs_in) {
  char *fname = "MatInv(V*,V*,F*)";
  VRB.Func(cname,fname);
  Vector *temp2;

  int temp_size = GJP.VolNodeSites() * lat.FsiteSize() / 2;

  // check out if converted
  //for (int ii = 0; ii < 2 * temp_size; ii++) {
  //  VRB.Result(cname, fname, "in[%d] = %e\n", ii, 
  //  *((Float *)in + ii));
  //  VRB.Result(cname, fname, "out[%d] = %e\n", ii, 
  //  *((Float *)out + ii));
  //}

  Vector *temp = (Vector *) smalloc(temp_size * sizeof(Float));
  if (temp == 0) ERR.Pointer(cname, fname, "temp");
  VRB.Smalloc(cname,fname, "temp", temp, temp_size * sizeof(Float));

  if(prs_in == PRESERVE_YES){
    temp2 = (Vector *) smalloc(2*temp_size * sizeof(Float));
    if (temp2 == 0) ERR.Pointer(cname, fname, "temp2");
    VRB.Smalloc(cname,fname, "temp2", temp2, temp_size * sizeof(Float));
  }
  // save source
  if(prs_in == PRESERVE_YES){
    moveMem((Float *)temp2, (Float *)in, 2*temp_size*sizeof(Float));
  }

#if 0
{
  printf("in(before)=\n");
  IFloat *temp_p = (IFloat *)in;
  for(int ii = 0; ii< GJP.VolNodeSites();ii++){
    for(int jj = 0; jj< lat.FsiteSize();jj++){
      if (fabs(*temp_p)>1e-7){
        printf("i=%d j=%d\n",ii,jj);
        printf("%e\n",*(temp_p));
      }
      temp_p++;
    }
  }
}
#endif

  // points to the even part of fermion source 
  Vector *even_in = (Vector *) ( (Float *) in + temp_size );

  // points to the even part of fermion solution
  Vector *even_out = (Vector *) ( (Float *) out + temp_size );

  Dslash(temp, even_in, CHKB_EVEN, DAG_NO);

  fTimesV1PlusV2((Float *)temp, (Float) kappa, (Float *)temp,
    (Float *)in, temp_size);

#if 0
{
  printf("temp(before)=\n");
  IFloat *temp_p = (IFloat *)temp;
  for(int ii = 0; ii< GJP.VolNodeSites();ii++){
    for(int jj = 0; jj< lat.FsiteSize();jj++){
      if (fabs(*temp_p)>1e-7){
        printf("i=%d j=%d\n",ii,jj);
        printf("%e\n",*(temp_p));
      }
      temp_p++;
    }
  }
}
#endif

  int iter;
  switch (dirac_arg->Inverter) {
  case CG:
    MatPcDag(in, temp);
    iter = InvCg(out,in,true_res);
    break;
  case BICGSTAB:
    iter = BiCGstab(out,temp,0.0,dirac_arg->bicgstab_n,true_res);
    break;
  default:
    ERR.General(cname,fname,"InverterType %d not implemented\n",
		dirac_arg->Inverter);
  }

  Dslash(temp, out, CHKB_ODD, DAG_NO);

  fTimesV1PlusV2((Float *)even_out, (Float) kappa, (Float *)temp,
    (Float *) even_in, temp_size);

  VRB.Sfree(cname, fname, "temp", temp);
  sfree(temp);


  // restore source
  if(prs_in == PRESERVE_YES){
    moveMem((Float *)in, (Float *)temp2, 2*temp_size*sizeof(Float));
  }

#if 0
{
  printf("in(after)=\n");
  IFloat *temp_p = (IFloat *)in;
  for(int ii = 0; ii< GJP.VolNodeSites();ii++){
    for(int jj = 0; jj< lat.FsiteSize();jj++){
      if (fabs(*temp_p)>1e-7){
        printf("i=%d j=%d\n",ii,jj);
        printf("%e\n",*(temp_p));
      }
      temp_p++;
    }
  }
}
#endif

#if 0
{
  printf("temp2(after)=\n");
  IFloat *temp_p = (IFloat *)temp2;
  for(int ii = 0; ii< GJP.VolNodeSites();ii++){
    for(int jj = 0; jj< lat.FsiteSize();jj++){
      if (fabs(*temp_p)>1e-7){
        printf("i=%d j=%d\n",ii,jj);
        printf("%e\n",*(temp_p));
      }
      temp_p++;
    }
  }
}
#endif

  if(prs_in == PRESERVE_YES){
    VRB.Sfree(cname, fname, "temp2", temp2);
    sfree(temp2);
  }

  return iter;
}

Example #4

Show file

File: d_op_wilson.C Project: DeanHowarth/QUDA-CPS

//------------------------------------------------------------------
// int MatInv(Vector *out, Vector *in, 
//            Float *true_res, PreserveType prs_in);
// The inverse of the unconditioned Dirac Operator 
// using Conjugate gradient.
// If true_res !=0 the value of the true residual is returned
// in true_res.
// *true_res = |src - MatPcDagMatPc * sol| / |src|
// prs_in is used to specify if the source
// in should be preserved or not. If not the memory usage
// is less by half the size of a fermion vector.
// The function returns the total number of CG iterations.
//------------------------------------------------------------------
int DiracOpWilson::MatInv(Vector *out, 
			  Vector *in, 
			  Float *true_res,
			  PreserveType prs_in) {
  char *fname = "MatInv(V*,V*,F*)";
  VRB.Func(cname,fname);
  Vector *temp2;

  int temp_size = GJP.VolNodeSites() * lat.FsiteSize() / 2;

  // check out if converted
  //for (int ii = 0; ii < 2 * temp_size; ii++) {
  //  VRB.Result(cname, fname, "in[%d] = %e\n", ii, 
  //  *((IFloat *)in + ii));
  //  VRB.Result(cname, fname, "out[%d] = %e\n", ii, 
  //  *((IFloat *)out + ii));
  //}

  Vector *temp = (Vector *) smalloc(temp_size * sizeof(Float));
  if (temp == 0) ERR.Pointer(cname, fname, "temp");
  VRB.Smalloc(cname,fname, "temp", temp, temp_size * sizeof(Float));

  if(prs_in == PRESERVE_YES){
    temp2 = (Vector *) smalloc(temp_size * sizeof(Float));
    if (temp2 == 0) ERR.Pointer(cname, fname, "temp2");
    VRB.Smalloc(cname,fname, "temp2", temp2, temp_size * sizeof(Float));
  }

  // points to the even part of fermion source 
  Vector *even_in = (Vector *) ( (IFloat *) in + temp_size );

  // points to the even part of fermion solution
  Vector *even_out = (Vector *) ( (IFloat *) out + temp_size );

  Dslash(temp, even_in, CHKB_EVEN, DAG_NO);

  fTimesV1PlusV2((IFloat *)temp, (IFloat) kappa, (IFloat *)temp,
    (IFloat *)in, temp_size);

  // save source
  if(prs_in == PRESERVE_YES){
    moveMem((IFloat *)temp2, (IFloat *)in, 
		temp_size * sizeof(IFloat) / sizeof(char));
  }

  MatPcDag(in, temp);

  int iter = InvCg(out,in,true_res);

  // restore source
  if(prs_in == PRESERVE_YES){
    moveMem((IFloat *)in, (IFloat *)temp2, 
		temp_size * sizeof(IFloat) / sizeof(char));
  }

  Dslash(temp, out, CHKB_ODD, DAG_NO);

  fTimesV1PlusV2((IFloat *)even_out, (IFloat) kappa, (IFloat *)temp,
    (IFloat *) even_in, temp_size);

  VRB.Sfree(cname, fname, "temp", temp);
  sfree(temp);

  if(prs_in == PRESERVE_YES){
    VRB.Sfree(cname, fname, "temp2", temp2);
    sfree(temp2);
  }

  return iter;
}

Example #5

Show file

File: p4_Fforce.C Project: DeanHowarth/QUDA-CPS

ForceArg Fp4::EvolveMomFforce(Matrix *mom, Vector *frm, Float mass, Float dt){
  char *fname = "EvolveMomFforce(M*,V*,F,F,F)";
  ERR.NotImplemented(cname,fname);
  ForceArg  Fdt;

#if 0
  VRB.Func(cname,fname);

#ifdef PROFILE
  Float dtime;
  ParTrans::PTflops=0;
  ForceFlops=0;
#endif
  size_t size;
  //	int nflops=0;
  static int vax_len = 0;
  if (vax_len == 0)
    vax_len = GJP.VolNodeSites()*VECT_LEN/VAXPY_UNROLL;

  size = GJP.VolNodeSites()/2*FsiteSize()*sizeof(Float);
  Vector *X = (Vector *)smalloc(2*size);
  //    printf("X=%p\n",X);
  Vector *X_e = X;                             // even sites
  Vector *X_o = X+GJP.VolNodeSites()/2;  // odd sites

  // The argument frm should have the CG solution.
  // The FstagTypes protected pointer f_tmp should contain Dslash frm

  moveMem(X_e, frm, size);
#ifdef DEBUGGING
  f_tmp = frm+GJP.VolNodeSites()/2; // debugging only
#endif
  moveMem(X_o, f_tmp, size);
  Fconvert(X, CANONICAL, STAG);

  Convert(STAG);  // Puts staggered phases into gauge field.
    
    
  int N; // N can be 1, 2 or 4.
  N = 4;
  if (GJP.VolNodeSites()>256)
    N = 2;  
  else if (GJP.VolNodeSites()>512)
    N = 1;
  VRB.Flow(cname,fname,"N=%d\n",N);

  enum{plus=0, minus=1, n_sign=2};

  // Array in which to accumulate the force term:
  // this must be initialised to zero 
#if 0
  Matrix **force = (Matrix**)amalloc(sizeof(Matrix), 2, 4, GJP.VolNodeSites());
  if(!force) ERR.Pointer(cname, fname, "force");
#else
  size = GJP.VolNodeSites()*sizeof(Matrix);
  Matrix *force[4];
  for(int i = 0;i<4;i++)
    force[i] = (Matrix *)v_alloc("force[i]",size);
#endif
  for(int i=0; i<4; i++)
    for(int s=0; s<GJP.VolNodeSites(); s++) force[i][s].ZeroMatrix();
  ParTransAsqtad parallel_transport(*this);


  // Vector arrays for which we must allocate memory

#if 0
  Vector ***Pnu = (Vector***)amalloc(sizeof(Vector), 3, n_sign, N, GJP.VolNodeSites());
  if(!Pnu) ERR.Pointer(cname, fname, "Pnu");
    
  Vector ****P3 = (Vector****)amalloc(sizeof(Vector), 4, n_sign, n_sign, N, GJP.VolNodeSites());
  if(!P3) ERR.Pointer(cname, fname, "P3");
  Vector ****Prhonu = (Vector****)amalloc(sizeof(Vector), 4, n_sign, n_sign, N, GJP.VolNodeSites());
  if(!Prhonu) ERR.Pointer(cname, fname, "Prhonu");
  Vector *****P5 = (Vector*****)amalloc(sizeof(Vector), 5, n_sign, n_sign, n_sign, N, GJP.VolNodeSites());
  if(!P5) ERR.Pointer(cname, fname, "P5");
  Vector ******P7 = (Vector******)amalloc(sizeof(Vector), 6, n_sign, n_sign, n_sign, n_sign, N, GJP.VolNodeSites());
  if(!P7) ERR.Pointer(cname, fname, "P7");
  Vector ******Psigma7 = (Vector******)amalloc(sizeof(Vector), 6, n_sign, n_sign, n_sign, n_sign, N, GJP.VolNodeSites());
  if(!Psigma7) ERR.Pointer(cname, fname, "Psigma7");

    
  // These vectors can be overlapped with previously allocated memory
    
  Vector **Pnununu = Prhonu[0][0];
  Vector ***Pnunu = Psigma7[0][0][0];;
  Vector ****Pnu5 = P7[0][0];
  Vector ****Pnu3 = P7[0][0];
  Vector *****Prho5 = Psigma7[0];
  Vector *****Psigmarhonu = Psigma7[0];
#else
  size = GJP.VolNodeSites()*sizeof(Vector);
  Vector *Pnu[n_sign][N];
  Vector *P3[n_sign][n_sign][N];
  Vector *Prhonu[n_sign][n_sign][N];
  Vector *P5[n_sign][n_sign][n_sign][N];
  Vector *P7[n_sign][n_sign][n_sign][n_sign][N];
  Vector *Psigma7[n_sign][n_sign][n_sign][n_sign][N];
  Vector *Pnununu[N];
  Vector *Pnunu[n_sign][N];
  Vector *Pnu5[n_sign][n_sign][N];
  Vector *Pnu3[n_sign][n_sign][N];
  Vector *Prho5[n_sign][n_sign][n_sign][N];
  Vector *Psigmarhonu[n_sign][n_sign][n_sign][N];
  //printf("Pnu=%p Psigmarhonu=%p\n",Pnu,Psigmarhonu);

  for(int w = 0;w<N;w++){
    for(int i = 0;i<n_sign;i++){
      Pnu[i][w]= (Vector *)v_alloc("Pnu",size);
      for(int j = 0;j<n_sign;j++){
	P3[i][j][w]= (Vector *)v_alloc("P3",size);
	Prhonu[i][j][w]= (Vector *)v_alloc("Prhonu",size);
	for(int k = 0;k<n_sign;k++){
	  P5[i][j][k][w]= (Vector *)v_alloc("P5",size);
	  for(int l = 0;l<n_sign;l++){
	    P7[i][j][k][l][w]= (Vector *)v_alloc("P7",size);
	    Psigma7[i][j][k][l][w]= (Vector *)v_alloc("Psigma7",size);
	  }
	  Prho5[i][j][k][w] = Psigma7[0][i][j][k][w];
	  Psigmarhonu[i][j][k][w] = Psigma7[0][i][j][k][w];
	}
	Pnu5[i][j][w]=P7[0][0][i][j][w];
	Pnu3[i][j][w]=P7[0][0][i][j][w];
      }
      Pnunu[i][w]=Psigma7[0][0][0][i][w];
    }
    Pnununu[w]=Prhonu[0][0][w];
  }

#endif
    


  // input/output arrays for the parallel transport routines
  Vector *vin[n_sign*N], *vout[n_sign*N];
  int dir[n_sign*N];
	
  int mu[N], nu[N], rho[N], sigma[N];   // Sets of directions
  int w;                                // The direction index 0...N-1
  int ms, ns, rs, ss;                   // Sign of direction
  bool done[4] = {false,false,false,false};  // Flags to tell us which 
  // nu directions we have done.
	    
#ifdef PROFILE
  dtime = -dclock();
#endif
  for (int m=0; m<4; m+=N){                     	    // Loop over mu
    for(w=0; w<N; w++) mu[w] = (m+w)%4; 

    for (int n=m+1; n<m+4; n++){                        // Loop over nu
      for(w=0; w<N; w++) nu[w] = (n+w)%4;

      // Pnu = U_nu X

      for(int i=0; i<N; i++){
	vin[i] = vin[i+N] = X;
	dir[n_sign*i] = n_sign*nu[i]+plus;        // nu_i
	dir[n_sign*i+1] = n_sign*nu[i]+minus;    // -nu_i
	vout[n_sign*i] = Pnu[minus][i];
	vout[n_sign*i+1] = Pnu[plus][i];
      }
      parallel_transport.run(n_sign*N, vout, vin, dir);

      // P3 = U_mu Pnu
      // ms is the nu sign index, ms is the mu sign index,
      // w is the direction index
      for(int i=0; i<N; i++){
	dir[n_sign*i] = n_sign*mu[i]+plus;        // mu_i
	dir[n_sign*i+1] = n_sign*mu[i]+minus;    // -mu_i
      }
      for(ns=0; ns<n_sign; ns++){               // ns is the sign of nu
	for(int i=0; i<N; i++){
	  vin[n_sign*i] = vin[n_sign*i+1] = Pnu[ns][i];
	  vout[n_sign*i] = P3[plus][ns][i];
	  vout[n_sign*i+1] = P3[minus][ns][i];
	}
	parallel_transport.run(n_sign*N, vout, vin, dir);
      }
	    
      for(w=0; w<N; w++)
	for(ns=0; ns<n_sign; ns++){
	  force_product_sum(P3[plus][ns][w], Pnu[ns][w],
			    GJP.staple3_coeff(),
			    force[mu[w]]);
	}

      for(int r=n+1; r<n+4; r++){                     // Loop over rho
	bool nextr = false;
	for(w=0; w<N; w++){
	  rho[w] = (r+w)%4;		
	  if(rho[w]==mu[w]){
	    nextr = true;
	    break;
	  }
	}
	if(nextr) continue;

	for(w=0; w<N; w++){                         // sigma
	  for(int s=rho[w]+1; s<rho[w]+4; s++){
	    sigma[w] = s%4;
	    if(sigma[w]!=mu[w] && sigma[w]!=nu[w]) break;
	  }
	}

	// Prhonu = U_rho Pnu 

	for(int i=0; i<N; i++){
	  dir[n_sign*i] = n_sign*rho[i]+plus;        
	  dir[n_sign*i+1] = n_sign*rho[i]+minus;    
	}
	for(ns=0; ns<n_sign; ns++){
	  for(int i=0; i<N; i++){
	    vin[n_sign*i] = vin[n_sign*i+1] = Pnu[ns][i];
	    vout[n_sign*i] = Prhonu[ns][minus][i];
	    vout[n_sign*i+1] = Prhonu[ns][plus][i];
	  }
	  parallel_transport.run(n_sign*N, vout, vin, dir);
	}

	// P5 = U_mu Prhonu

	for(int i=0; i<N; i++){
	  dir[n_sign*i] = n_sign*mu[i]+plus;        
	  dir[n_sign*i+1] = n_sign*mu[i]+minus;    
	}
	for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) {
	  for(int i=0; i<N; i++){
	    vin[n_sign*i] = vin[n_sign*i+1] = Prhonu[ns][rs][i];
	    vout[n_sign*i] = P5[plus][ns][rs][i];
	    vout[n_sign*i+1] = P5[minus][ns][rs][i];
	  }
	  parallel_transport.run(n_sign*N, vout, vin, dir);
	}

	// F_mu += P5 Prhonu^dagger
		      
	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++)
	    force_product_sum(P5[plus][ns][rs][w],
			      Prhonu[ns][rs][w],
			      GJP.staple5_coeff(),
			      force[mu[w]]);

	// Psigmarhonu = U_sigma P_rhonu
		
	for(int i=0; i<N; i++){
	  dir[n_sign*i] = (n_sign*sigma[i]);        
	  dir[n_sign*i+1] = (n_sign*sigma[i]+1);    
	}
	for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++){
	  for(int i=0; i<N; i++){
	    vin[n_sign*i] = vin[n_sign*i+1] = Prhonu[ns][rs][i];
	    vout[n_sign*i] = Psigmarhonu[ns][rs][minus][i];
	    vout[n_sign*i+1] = Psigmarhonu[ns][rs][plus][i];
	  }
	  parallel_transport.run(n_sign*N, vout, vin, dir);
	}

	// P7 = U_mu P_sigmarhonu
	for(int i=0; i<N; i++){
	  dir[n_sign*i] = n_sign*mu[i]+plus;        
	  dir[n_sign*i+1] = n_sign*mu[i]+minus;    
	}
	for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) for(ss=0; ss<n_sign; ss++){
	  for(int i=0; i<N; i++){
	    vin[n_sign*i] = vin[n_sign*i+1] = Psigmarhonu[ns][rs][ss][i];
	    vout[n_sign*i] = P7[plus][ns][rs][ss][i];
	    vout[n_sign*i+1] = P7[minus][ns][rs][ss][i];
	  }
	  parallel_transport.run(n_sign*N, vout, vin, dir);
	}

	// F_mu -= P7 Psigmarhonu^\dagger
		
	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) for(ss=0; ss<n_sign; ss++)
	    force_product_sum(P7[plus][ns][rs][ss][w],
			      Psigmarhonu[ns][rs][ss][w],
			      GJP.staple7_coeff(),
			      force[mu[w]]);

	// F_sigma += P7 Psigmarhonu^\dagger
	// N.B. this is the same as one of the previous products.
		
	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) 
	    force_product_sum(P7[plus][ns][rs][minus][w],
			      Psigmarhonu[ns][rs][minus][w],
			      -GJP.staple7_coeff(),
			      force[sigma[w]]);

	// F_sigma += Psigmarhonu P7^\dagger
		
	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) 
	    force_product_sum(Psigmarhonu[ns][rs][minus][w],
			      P7[minus][ns][rs][minus][w],
			      -GJP.staple7_coeff(),
			      force[sigma[w]]);

	// Psigma7 = U_sigma P7 
	for(int i=0; i<N; i++){
	  dir[n_sign*i] = (n_sign*sigma[i]);        
	  dir[n_sign*i+1] = (n_sign*sigma[i]+1);    
	}
	for(ms=0; ms<n_sign; ms++) for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++){
	  for(int i=0; i<N; i++){
	    vin[n_sign*i] = P7[ms][ns][rs][plus][i];
	    vin[n_sign*i+1] = P7[ms][ns][rs][minus][i];
	    vout[n_sign*i] = Psigma7[ms][ns][rs][plus][i];
	    vout[n_sign*i+1] = Psigma7[ms][ns][rs][minus][i];
	  }
	  parallel_transport.run(n_sign*N, vout, vin, dir);
	}

	// F_sigma += Fsigma7 Frhonu^\dagger

	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) 
	    force_product_sum(Psigma7[plus][ns][rs][plus][w],
			      Prhonu[ns][rs][w],
			      -GJP.staple7_coeff(),
			      force[sigma[w]]);

	// F_sigma += Frhonu Fsigma7^\dagger

	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++) for(rs=0; rs<n_sign; rs++) 
	    force_product_sum(Prhonu[ns][rs][w],
			      Psigma7[minus][ns][rs][plus][w],
			      -GJP.staple7_coeff(),
			      force[sigma[w]]);

	// P5 += c_7/c_5 Psigma7

	if(GJP.staple5_coeff()!=0.0){
	  Float c75 = -GJP.staple7_coeff()/GJP.staple5_coeff();
	  for(ms=0; ms<n_sign; ms++) 
	    for(ns=0; ns<n_sign; ns++) 
	      for(rs=0; rs<n_sign; rs++) 
		for(ss=0; ss<n_sign; ss++) 
		  for(w=0; w<N; w++)
		    vaxpy3(P5[ms][ns][rs][w],&c75, Psigma7[ms][ns][rs][ss][w], P5[ms][ns][rs][w], vax_len);
	  //			P5[ms][ns][rs][w]->FTimesV1PlusV2(-GJP.staple7_coeff()/GJP.staple5_coeff(), Psigma7[ms][ns][rs][ss][w], P5[ms][ns][rs][w], GJP.VolNodeSites()*VECT_LEN);
	  ForceFlops += 2*GJP.VolNodeSites()*VECT_LEN*N*n_sign*n_sign*n_sign*n_sign;
	}

	// F_rho -= P5 Prhonu^\dagger
	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++)
	    force_product_sum(P5[plus][ns][minus][w],
			      Prhonu[ns][minus][w],
			      -GJP.staple5_coeff(),
			      force[rho[w]]);

	// F_rho -= Prhonu P5^\dagger
		    
	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++)
	    force_product_sum(Prhonu[ns][minus][w],
			      P5[minus][ns][minus][w],
			      -GJP.staple5_coeff(),
			      force[rho[w]]);

	// Prho5 = U_rho P5

	for(int i=0; i<N; i++){
	  dir[n_sign*i] = n_sign*rho[i]+plus;        
	  dir[n_sign*i+1] = n_sign*rho[i]+minus;    
	}
	for(ms=0; ms<n_sign; ms++) for(ns=0; ns<n_sign; ns++){
	  for(int i=0; i<N; i++){
	    vin[n_sign*i] = P5[ms][ns][plus][i];
	    vin[n_sign*i+1] = P5[ms][ns][minus][i];
	    vout[n_sign*i] = Prho5[ms][ns][plus][i];
	    vout[n_sign*i+1] = Prho5[ms][ns][minus][i];
	  }
	  parallel_transport.run(n_sign*N, vout, vin, dir);
	}

	// F_rho -= Prho5 Pnu^\dagger
		
	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++)
	    force_product_sum(Prho5[plus][ns][plus][w],
			      Pnu[ns][w],
			      -GJP.staple5_coeff(),
			      force[rho[w]]);

	// F_rho -= Pnu Prho5^\dagger
		
	for(w=0; w<N; w++)
	  for(ns=0; ns<n_sign; ns++)
	    force_product_sum(Pnu[ns][w],
			      Prho5[minus][ns][plus][w],
			      -GJP.staple5_coeff(),
			      force[rho[w]]);
		
	// P3 += c_5/c_3 Prho5

	if(GJP.staple3_coeff()!=0.0){		
	  Float c53 = -GJP.staple5_coeff()/GJP.staple3_coeff();
	  for(ms=0; ms<n_sign; ms++) 
	    for(ns=0; ns<n_sign; ns++) 
	      for(rs=0; rs<n_sign; rs++) 
		for(w=0; w<N; w++)
		  vaxpy3(P3[ms][ns][w],&c53,Prho5[ms][ns][rs][w], P3[ms][ns][w], vax_len);
	  //			P3[ms][ns][w]->FTimesV1PlusV2(-GJP.staple5_coeff()/GJP.staple3_coeff(), Prho5[ms][ns][rs][w], P3[ms][ns][w], GJP.VolNodeSites()*VECT_LEN);
	  ForceFlops += 2*GJP.VolNodeSites()*VECT_LEN*N*n_sign*n_sign*n_sign;
	}

      } // rho+sigma loop

      // Pnunu = U_nu Pnu

      for(int i=0; i<N; i++){
	dir[n_sign*i] = n_sign*nu[i]+plus;        
	dir[n_sign*i+1] = n_sign*nu[i]+minus;    
      }
      for(int i=0; i<N; i++){
	vin[n_sign*i] = Pnu[minus][i];
	vin[n_sign*i+1] = Pnu[plus][i];
	vout[n_sign*i] = Pnunu[minus][i];
	vout[n_sign*i+1] = Pnunu[plus][i];
      }
      parallel_transport.run(n_sign*N, vout, vin, dir);

      // P5 = U_mu Pnunu

      for(int i=0; i<N; i++){
	dir[n_sign*i] = n_sign*mu[i]+plus;        
	dir[n_sign*i+1] = n_sign*mu[i]+minus;    
      }
      for(ns=0; ns<n_sign; ns++){
	for(int i=0; i<N; i++){
	  vin[n_sign*i] = Pnunu[ns][i];
	  vin[n_sign*i+1] = Pnunu[ns][i];
	  vout[n_sign*i] = P5[plus][ns][0][i];
	  vout[n_sign*i+1] = P5[minus][ns][0][i];
	}
	parallel_transport.run(n_sign*N, vout, vin, dir);
      }

      // F_mu += P5 Pnunu^\dagger

      for(w=0; w<N; w++)
	for(ns=0; ns<n_sign; ns++)
	  force_product_sum(P5[plus][ns][0][w],
			    Pnunu[ns][w],
			    GJP.Lepage_coeff(),
			    force[mu[w]]);

      // F_nu -= P5 Pnunu^\dagger
      // N.B. this is the same as one of the previous products
	    
      for(w=0; w<N; w++)
	force_product_sum(P5[plus][minus][0][w],
			  Pnunu[minus][w],
			  -GJP.Lepage_coeff(),
			  force[nu[w]]);
	    
      // F_nu -= Pnunu P5^\dagger
	    
      for(w=0; w<N; w++)
	force_product_sum(Pnunu[minus][w],
			  P5[minus][minus][0][w],
			  -GJP.Lepage_coeff(),
			  force[nu[w]]);

      // Pnu5 = U_nu P5

      for(int i=0; i<N; i++){
	dir[n_sign*i] = n_sign*nu[i]+plus;        
	dir[n_sign*i+1] = n_sign*nu[i]+minus;    
      }
      for(ms=0; ms<n_sign; ms++){
	for(int i=0; i<N; i++){
	  vin[n_sign*i] =   P5[ms][plus][0][i]; 
	  vin[n_sign*i+1] = P5[ms][minus][0][i];
	  vout[n_sign*i] =   Pnu5[ms][plus][i];
	  vout[n_sign*i+1] = Pnu5[ms][minus][i];
	}
	parallel_transport.run(n_sign*N, vout, vin, dir);
      }

      // F_nu -= Pnu5 Pnu^\dagger

      for(w=0; w<N; w++)
	force_product_sum(Pnu5[plus][plus][w],
			  Pnu[plus][w],
			  -GJP.Lepage_coeff(),
			  force[nu[w]]);

      // F_nu -= Pnu Pnu5^\dagger

      for(w=0; w<N; w++)
	force_product_sum(Pnu[plus][w],
			  Pnu5[minus][plus][w],
			  -GJP.Lepage_coeff(),
			  force[nu[w]]);

      // P3 += c_L/c_3 Pnu5

      if(GJP.staple3_coeff()!=0.0){
	Float cl3 = -GJP.Lepage_coeff()/GJP.staple3_coeff();
	for(ms=0; ms<n_sign; ms++) 
	  for(ns=0; ns<n_sign; ns++) 
	    for(w=0; w<N; w++)
	      vaxpy3(P3[ms][ns][w],&cl3,Pnu5[ms][ns][w],P3[ms][ns][w], vax_len);
	//		   		P3[ms][ns][w]->FTimesV1PlusV2(-GJP.Lepage_coeff()/GJP.staple3_coeff(), Pnu5[ms][ns][w], P3[ms][ns][w], GJP.VolNodeSites()*VECT_LEN);
	ForceFlops += 2*GJP.VolNodeSites()*VECT_LEN*N*n_sign*n_sign;
      }

      // F_nu += P3 Pnu^\dagger

      for(w=0; w<N; w++)
	force_product_sum(P3[plus][minus][w],
			  Pnu[minus][w],
			  -GJP.staple3_coeff(),
			  force[nu[w]]);

      // F_nu +=  Pnu P3^\dagger

      for(w=0; w<N; w++)
	force_product_sum(Pnu[minus][w],
			  P3[minus][minus][w],
			  -GJP.staple3_coeff(),
			  force[nu[w]]);
	    
      // Pnu3 = U_nu P3

      for(int i=0; i<N; i++)
	dir[i] = n_sign*nu[i]+plus;        
      for(ms=0; ms<n_sign; ms++){
	for(int i=0; i<N; i++){
	  vin[i] = P3[ms][plus][i]; 
	  vout[i] = Pnu3[ms][plus][i];
	}
	parallel_transport.run(N, vout, vin, dir);
      }

      // F_nu += Pnu3 X^\dagger

      for(w=0; w<N; w++)
	force_product_sum(Pnu3[plus][plus][w], X,
			  -GJP.staple3_coeff(),
			  force[nu[w]]);

      // F_nu += X Pnu3^\dagger

      for(w=0; w<N; w++)
	force_product_sum(X, Pnu3[minus][plus][w], 
			  -GJP.staple3_coeff(),
			  force[nu[w]]);

      // This stuff is to be done once only for each value of nu[w].
      // Look for  N nu's that haven't been done before.

      bool nextn = false;
      for(w=0; w<N; w++)
	if(done[nu[w]]){
	  nextn = true;
	  break;
	}
      if(nextn) continue;
      for(w=0; w<N; w++) done[nu[w]] = true;

      // Got N new nu's, so do some stuff...
	    
      // F_nu += Pnu X^\dagger

      for(w=0; w<N; w++)
	force_product_sum(Pnu[minus][w], X,
			  GJP.KS_coeff(),
			  force[nu[w]]);

      // F_nu += Pnunu Pnu^\dagger

      for(w=0; w<N; w++)
	force_product_sum(Pnunu[minus][w], Pnu[plus][w],
			  -GJP.Naik_coeff(),
			  force[nu[w]]);

      // F_nu += Pnu Pnunu^\dagger
	    
      for(w=0; w<N; w++)
	force_product_sum(Pnu[minus][w], Pnunu[plus][w],
			  GJP.Naik_coeff(),
			  force[nu[w]]);

      // Pnununu = U_nu Pnunu

      for(int i=0; i<N; i++){
	dir[i] = n_sign*nu[i]+plus;        
	vin[i] = Pnunu[minus][i]; 
	vout[i] = Pnununu[i];
      }
      parallel_transport.run(N, vout, vin, dir);
	    
      // F_nu += Pnununu X^\dagger
		
      for(w=0; w<N; w++)
	force_product_sum(Pnununu[w], X,
			  GJP.Naik_coeff(),
			  force[nu[w]]);
		
	    

    } // nu loop
  } // mu loop


    // Now that we have computed the force, we can update the momenta

  //	nflops +=ParTrans::PTflops + ForceFlops;

#ifdef PROFILE
  dtime += dclock();
  int nflops = ParTrans::PTflops + ForceFlops;
  printf("%s:%s:",cname,fname);
  print_flops(nflops,dtime);
#endif

  Fdt = update_momenta(force, dt, mom);


  // Tidy up
    
#if 0
  sfree(Pnu);
  sfree(P3);
  sfree(Prhonu);
  sfree(P5);
  sfree(P7);
  sfree(Psigma7);
#else
  for(int w = 0;w<N;w++){
    for(int i = 0;i<n_sign;i++){
      v_free(Pnu[i][w]);
      for(int j = 0;j<n_sign;j++){
	v_free(P3[i][j][w]);
	v_free(Prhonu[i][j][w]);
	for(int k = 0;k<n_sign;k++){
	  v_free(P5[i][j][k][w]);
	  for(int l = 0;l<n_sign;l++){
	    v_free(P7[i][j][k][l][w]);
	    v_free(Psigma7[i][j][k][l][w]);
	  }
	}
      }
    }
  }
#endif

  for(int i = 0;i<4;i++) v_free(force[i]);    
  sfree(X);

  Convert(CANONICAL);
#endif   

  return Fdt;
}

Example #6

Show file

File: tree.c Project: B-Rich/dart

void
wrapPrint(FILE *fp, char *s)
{

#define LINEWIDTH 80

  static char printBuf[2*LINEWIDTH];

  if(!s) { /* s==NULL is the last call, dump whatever is remaining */

    fprintf(fp, "%s\n", printBuf);
    printBuf[0]=0;
    return;

  } else {

    if(strlen(printBuf)+strlen(s)>(2*LINEWIDTH-1))
      {
	fprintf(stderr, "FATAL ERROR--Out of static space\n"
		"weighbor:tree:wrapPrint:: length=%d\n",
		(int) (strlen(printBuf)+strlen(s)));
	exit(1);
      }
    
    strcat(printBuf, s);
    
    if(strlen(printBuf) >= LINEWIDTH) {
      
      char *breakpoint;
      
      /*
	First try to find a common to break the line
	*/
      
      breakpoint = strrchr(printBuf, ',');
      
      if(breakpoint) {
	
	*breakpoint=0;
	fprintf(fp, "%s,\n", printBuf);
	++breakpoint;
	moveMem(printBuf, breakpoint, strlen(breakpoint)+1);
	
      } else {
	
	/*
	  try a ')'
	  */
	
	breakpoint = strrchr(printBuf, ')');
	
	if(breakpoint) {
	  
	  *breakpoint=0;
	  fprintf(fp, "%s)\n", printBuf);
	  ++breakpoint;
	  moveMem(printBuf, breakpoint, strlen(breakpoint)+1);
	  
	} else {
	  
	  /* Break at line break */
	  
	  char tmpc = printBuf[LINEWIDTH];
	  printBuf[LINEWIDTH] = 0;
	  
	  fprintf(fp, "%s\n", printBuf);
	  
	  printBuf[LINEWIDTH]=tmpc;
	  moveMem(printBuf, &(printBuf[LINEWIDTH]), 
		  strlen(&(printBuf[LINEWIDTH]))+1);
	  
	}
      }
      
    }
  }
}