Exemplo n.º 1
0
static int
unitarize_link_test()
{

  QudaGaugeParam qudaGaugeParam = newQudaGaugeParam();

  initQuda(0);

  cpu_prec = prec;
  gSize = cpu_prec;  
  qudaGaugeParam.anisotropy = 1.0;
  
  qudaGaugeParam.X[0] = xdim;
  qudaGaugeParam.X[1] = ydim;
  qudaGaugeParam.X[2] = zdim;
  qudaGaugeParam.X[3] = tdim;

  setDims(qudaGaugeParam.X);
  
  QudaPrecision link_prec = QUDA_SINGLE_PRECISION;
  QudaReconstructType link_recon = QUDA_RECONSTRUCT_NO;

  qudaGaugeParam.cpu_prec  = link_prec;
  qudaGaugeParam.cuda_prec = link_prec;
  qudaGaugeParam.reconstruct = link_recon;
  qudaGaugeParam.type = QUDA_WILSON_LINKS;


  hisq::fermion_force::hisqForceInitCuda(&qudaGaugeParam);
  
  qudaGaugeParam.t_boundary  	   = QUDA_PERIODIC_T;
  qudaGaugeParam.anisotropy  	   = 1.0;
  qudaGaugeParam.cuda_prec_sloppy   = prec;
  qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO;
  qudaGaugeParam.gauge_fix   	   = QUDA_GAUGE_FIXED_NO;
  qudaGaugeParam.ga_pad      	   = 0;
  qudaGaugeParam.packed_size 	   = 0;
  qudaGaugeParam.gaugeGiB    	   = 0;
  qudaGaugeParam.flag              = false;

   
  qudaGaugeParam.cpu_prec = cpu_prec;
  qudaGaugeParam.cuda_prec = prec;
  qudaGaugeParam.gauge_order = gauge_order;
  qudaGaugeParam.type=QUDA_WILSON_LINKS;
  qudaGaugeParam.reconstruct = link_recon;
  qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE
    | QUDA_FAT_PRESERVE_GPU_GAUGE
    | QUDA_FAT_PRESERVE_COMM_MEM;

  setFatLinkPadding(QUDA_COMPUTE_FAT_STANDARD, &qudaGaugeParam);
 
  GaugeFieldParam gParam(0, qudaGaugeParam);
  gParam.pad = 0;
  gParam.create    = QUDA_REFERENCE_FIELD_CREATE;
  gParam.link_type = QUDA_WILSON_LINKS;
  gParam.order     = QUDA_MILC_GAUGE_ORDER;
  cpuGaugeField *cpuOutLink  = new cpuGaugeField(gParam);

  gParam.pad         = 0;
  gParam.create      = QUDA_NULL_FIELD_CREATE;
  gParam.link_type   = QUDA_WILSON_LINKS;
  gParam.order       = QUDA_QDP_GAUGE_ORDER;
  gParam.reconstruct = QUDA_RECONSTRUCT_NO;
  cudaGaugeField *cudaFatLink = new cudaGaugeField(gParam);
  cudaGaugeField *cudaULink   = new cudaGaugeField(gParam);  

  initCommonConstants(*cudaFatLink);

  void* fatlink = (void*)malloc(4*V*gaugeSiteSize*gSize);
  if(fatlink == NULL){
    errorQuda("ERROR: allocating fatlink failed\n");
  }
  
  void* sitelink[4];
  for(int i=0;i < 4;i++){
    cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize);
    if(sitelink[i] == NULL){
      errorQuda("ERROR; allocate sitelink[%d] failed\n", i);
    }
  }
  
  createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1);

  double act_path_coeff[6];
  act_path_coeff[0] = 0.625000;
  act_path_coeff[1] = -0.058479;
  act_path_coeff[2] = -0.087719;
  act_path_coeff[3] = 0.030778;
  act_path_coeff[4] = -0.007200;
  act_path_coeff[5] = -0.123113;


  //only record the last call's performance
  //the first one is for creating the cpu/cuda data structures
  
  if(gauge_order == QUDA_QDP_GAUGE_ORDER){
    computeFatLinkQuda(fatlink, sitelink, act_path_coeff, &qudaGaugeParam,
			   QUDA_COMPUTE_FAT_STANDARD);
  } // gauge order is QDP_GAUGE_ORDER

  cpuOutLink->setGauge((void**)fatlink);
  cudaFatLink->loadCPUField(*cpuOutLink, QUDA_CPU_FIELD_LOCATION);
 

 
  hisq::setUnitarizeLinksConstants(unitarize_eps,
				   max_allowed_error,
				   reunit_allow_svd,
				   reunit_svd_only,
				   svd_rel_error,
				   svd_abs_error);
 
  hisq::setUnitarizeLinksPadding(0,0);

  int* num_failures_dev;
  cudaMalloc(&num_failures_dev, sizeof(int));
  cudaMemset(num_failures_dev, 0, sizeof(int));

  struct timeval t0, t1;

  gettimeofday(&t0,NULL);
  hisq::unitarizeLinksCuda(qudaGaugeParam,*cudaFatLink, cudaULink, num_failures_dev);
  cudaThreadSynchronize();
  gettimeofday(&t1,NULL);

  int num_failures=0;
  cudaMemcpy(&num_failures, num_failures_dev, sizeof(int), cudaMemcpyDeviceToHost);

 delete cudaFatLink;
 delete cudaULink;
 for(int dir=0; dir<4; ++dir) cudaFreeHost(sitelink[dir]);
  cudaFree(num_failures_dev); 
#ifdef MULTI_GPU
  exchange_llfat_cleanup();
#endif
  endQuda();
   
  printfQuda("Unitarization time: %g ms\n", TDIFF(t0,t1)*1000); 
  return num_failures;
}
Exemplo n.º 2
0
static void
gauge_force_test(void) 
{
  int max_length = 6;    
  
  initQuda(device);
  setVerbosityQuda(QUDA_VERBOSE,"",stdout);

  qudaGaugeParam = newQudaGaugeParam();
  
  qudaGaugeParam.X[0] = xdim;
  qudaGaugeParam.X[1] = ydim;
  qudaGaugeParam.X[2] = zdim;
  qudaGaugeParam.X[3] = tdim;
  
  setDims(qudaGaugeParam.X);
  
  qudaGaugeParam.anisotropy = 1.0;
  qudaGaugeParam.cpu_prec = link_prec;
  qudaGaugeParam.cuda_prec = link_prec;
  qudaGaugeParam.cuda_prec_sloppy = link_prec;
  qudaGaugeParam.reconstruct = link_recon;  
  qudaGaugeParam.reconstruct_sloppy = link_recon;  
  qudaGaugeParam.type = QUDA_SU3_LINKS; // in this context, just means these are site links   
  
  qudaGaugeParam.gauge_order = gauge_order;
  qudaGaugeParam.t_boundary = QUDA_PERIODIC_T;
  qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO;
  qudaGaugeParam.ga_pad = 0;
  qudaGaugeParam.mom_ga_pad = 0;

  size_t gSize = qudaGaugeParam.cpu_prec;
    
  void* sitelink;  
  void* sitelink_1d;
  
#ifdef GPU_DIRECT
  sitelink_1d = pinned_malloc(4*V*gaugeSiteSize*gSize);
#else
  sitelink_1d = safe_malloc(4*V*gaugeSiteSize*gSize);
#endif
  
  // this is a hack to have site link generated in 2d 
  // then copied to 1d array in "MILC" format
  void* sitelink_2d[4];
#ifdef GPU_DIRECT
  for(int i=0;i<4;i++) sitelink_2d[i] = pinned_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec); 
#else
  for(int i=0;i<4;i++) sitelink_2d[i] = safe_malloc(V*gaugeSiteSize*qudaGaugeParam.cpu_prec);
#endif
  
  // fills the gauge field with random numbers
  createSiteLinkCPU(sitelink_2d, qudaGaugeParam.cpu_prec, 0);
  
  //copy the 2d sitelink to 1d milc format 
  
  for(int dir = 0; dir < 4; dir++){
    for(int i=0; i < V; i++){
      char* src =  ((char*)sitelink_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec;
      char* dst =  ((char*)sitelink_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ;
      memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec);
    }
  }
  if (qudaGaugeParam.gauge_order ==  QUDA_MILC_GAUGE_ORDER){ 
    sitelink =  sitelink_1d;    
  }else if (qudaGaugeParam.gauge_order == QUDA_QDP_GAUGE_ORDER) {
    sitelink = (void**)sitelink_2d;
  } else {
    errorQuda("Unsupported gauge order %d", qudaGaugeParam.gauge_order);
  }
  
#ifdef MULTI_GPU
  void* sitelink_ex_2d[4];
  void* sitelink_ex_1d;

  sitelink_ex_1d = pinned_malloc(4*V_ex*gaugeSiteSize*gSize);
  for(int i=0;i < 4;i++) sitelink_ex_2d[i] = pinned_malloc(V_ex*gaugeSiteSize*gSize);

  int X1= Z[0];
  int X2= Z[1];
  int X3= Z[2];
  int X4= Z[3];

  for(int i=0; i < V_ex; i++){
    int sid = i;
    int oddBit=0;
    if(i >= Vh_ex){
      sid = i - Vh_ex;
      oddBit = 1;
    }
    
    int za = sid/E1h;
    int x1h = sid - za*E1h;
    int zb = za/E2;
    int x2 = za - zb*E2;
    int x4 = zb/E3;
    int x3 = zb - x4*E3;
    int x1odd = (x2 + x3 + x4 + oddBit) & 1;
    int x1 = 2*x1h + x1odd;

    if( x1< 2 || x1 >= X1 +2
        || x2< 2 || x2 >= X2 +2
        || x3< 2 || x3 >= X3 +2
        || x4< 2 || x4 >= X4 +2){
      continue;
    }
    
    x1 = (x1 - 2 + X1) % X1;
    x2 = (x2 - 2 + X2) % X2;
    x3 = (x3 - 2 + X3) % X3;
    x4 = (x4 - 2 + X4) % X4;
    
    int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
    if(oddBit){
      idx += Vh;
    }
    for(int dir= 0; dir < 4; dir++){
      char* src = (char*)sitelink_2d[dir];
      char* dst = (char*)sitelink_ex_2d[dir];
      memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
    }//dir
  }//i
  
  
  for(int dir = 0; dir < 4; dir++){
    for(int i=0; i < V_ex; i++){
      char* src =  ((char*)sitelink_ex_2d[dir]) + i * gaugeSiteSize* qudaGaugeParam.cpu_prec;
      char* dst =  ((char*)sitelink_ex_1d) + (4*i+dir)*gaugeSiteSize*qudaGaugeParam.cpu_prec ;
      memcpy(dst, src, gaugeSiteSize*qudaGaugeParam.cpu_prec);
    }
  }
  
#endif

  void* mom = safe_malloc(4*V*momSiteSize*gSize);
  void* refmom = safe_malloc(4*V*momSiteSize*gSize);

  memset(mom, 0, 4*V*momSiteSize*gSize);
  //initialize some data in cpuMom
  createMomCPU(mom, qudaGaugeParam.cpu_prec);      
  memcpy(refmom, mom, 4*V*momSiteSize*gSize);
  
  
  double loop_coeff_d[sizeof(loop_coeff_f)/sizeof(float)];
  for(unsigned int i=0;i < sizeof(loop_coeff_f)/sizeof(float); i++){
    loop_coeff_d[i] = loop_coeff_f[i];
  }
    
  void* loop_coeff;
  if(qudaGaugeParam.cuda_prec == QUDA_SINGLE_PRECISION){
    loop_coeff = (void*)&loop_coeff_f[0];
  }else{
    loop_coeff = loop_coeff_d;
  }
  double eb3 = 0.3;
  int num_paths = sizeof(path_dir_x)/sizeof(path_dir_x[0]);
  
  int** input_path_buf[4];
  for(int dir =0; dir < 4; dir++){
    input_path_buf[dir] = (int**)safe_malloc(num_paths*sizeof(int*));    
    for(int i=0;i < num_paths;i++){
      input_path_buf[dir][i] = (int*)safe_malloc(length[i]*sizeof(int));
      if(dir == 0) memcpy(input_path_buf[dir][i], path_dir_x[i], length[i]*sizeof(int));
      else if(dir ==1) memcpy(input_path_buf[dir][i], path_dir_y[i], length[i]*sizeof(int));
      else if(dir ==2) memcpy(input_path_buf[dir][i], path_dir_z[i], length[i]*sizeof(int));
      else if(dir ==3) memcpy(input_path_buf[dir][i], path_dir_t[i], length[i]*sizeof(int));
    }
  }

  if (tune) {
    printfQuda("Tuning...\n");
    setTuning(QUDA_TUNE_YES);
  }
  
  struct timeval t0, t1;
  double timeinfo[3];
  /* Multiple execution to exclude warmup time in the first run*/
  for (int i =0;i < attempts; i++){
    gettimeofday(&t0, NULL);
    computeGaugeForceQuda(mom, sitelink,  input_path_buf, length,
			  loop_coeff_d, num_paths, max_length, eb3,
			  &qudaGaugeParam, timeinfo);
    gettimeofday(&t1, NULL);
  }
 
  double total_time = t1.tv_sec - t0.tv_sec + 0.000001*(t1.tv_usec - t0.tv_usec);
  //The number comes from CPU implementation in MILC, gauge_force_imp.c
  int flops=153004;
    
  if (verify_results){	
    for(int i = 0;i < attempts;i++){
#ifdef MULTI_GPU
      //last arg=0 means no optimization for communication, i.e. exchange data in all directions 
      //even they are not partitioned
      int R[4] = {2, 2, 2, 2};
      exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, (void**)sitelink_ex_2d,
			       QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);    
      gauge_force_reference(refmom, eb3, sitelink_2d, sitelink_ex_2d, qudaGaugeParam.cpu_prec,
			    input_path_buf, length, loop_coeff, num_paths);
#else
      gauge_force_reference(refmom, eb3, sitelink_2d, NULL, qudaGaugeParam.cpu_prec,
			    input_path_buf, length, loop_coeff, num_paths);
#endif
    }
  
    int res;
    res = compare_floats(mom, refmom, 4*V*momSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
    
    strong_check_mom(mom, refmom, 4*V, qudaGaugeParam.cpu_prec);
    
    printf("Test %s\n",(1 == res) ? "PASSED" : "FAILED");
  }  

  double perf = 1.0* flops*V/(total_time*1e+9);
  double kernel_perf = 1.0*flops*V/(timeinfo[1]*1e+9);
  printf("init and cpu->gpu time: %.2f ms, kernel time: %.2f ms, gpu->cpu and cleanup time: %.2f  total time =%.2f ms\n", 
	 timeinfo[0]*1e+3, timeinfo[1]*1e+3, timeinfo[2]*1e+3, total_time*1e+3);
  printf("kernel performance: %.2f GFLOPS, overall performance : %.2f GFLOPS\n", kernel_perf, perf);
  
  for(int dir = 0; dir < 4; dir++){
    for(int i=0;i < num_paths; i++) host_free(input_path_buf[dir][i]);
    host_free(input_path_buf[dir]);
  }
  
  host_free(sitelink_1d);
  for(int dir=0;dir < 4;dir++) host_free(sitelink_2d[dir]);
  
#ifdef MULTI_GPU  
  host_free(sitelink_ex_1d);
  for(int dir=0; dir < 4; dir++) host_free(sitelink_ex_2d[dir]);
#endif


  host_free(mom);
  host_free(refmom);
  endQuda();
}            
Exemplo n.º 3
0
  static void
llfat_test(int test)
{

  QudaGaugeParam qudaGaugeParam;
#ifdef MULTI_GPU
  void* ghost_sitelink[4];
  void* ghost_sitelink_diag[16];
#endif


  initQuda(device);

  cpu_prec = prec;
  gSize = cpu_prec;  
  qudaGaugeParam = newQudaGaugeParam();

  qudaGaugeParam.anisotropy = 1.0;

  qudaGaugeParam.X[0] = xdim;
  qudaGaugeParam.X[1] = ydim;
  qudaGaugeParam.X[2] = zdim;
  qudaGaugeParam.X[3] = tdim;

  setDims(qudaGaugeParam.X);

  qudaGaugeParam.cpu_prec = cpu_prec;
  qudaGaugeParam.cuda_prec = prec;
  qudaGaugeParam.gauge_order = gauge_order;
  qudaGaugeParam.type=QUDA_WILSON_LINKS;
  qudaGaugeParam.reconstruct = link_recon;
  /*
     qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE
     | QUDA_FAT_PRESERVE_GPU_GAUGE
     | QUDA_FAT_PRESERVE_COMM_MEM;
     */
  qudaGaugeParam.preserve_gauge =0;
  void* fatlink;
  if (cudaMallocHost((void**)&fatlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for fatlink\n");
  }

  void* longlink;
  if (cudaMallocHost((void**)&longlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for longlink\n");
  } // page-locked memory

  void* sitelink[4];
  for(int i=0;i < 4;i++){
    if (cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
      errorQuda("ERROR: cudaMallocHost failed for sitelink\n");
    }
  }

  void* sitelink_ex[4];
  for(int i=0;i < 4;i++){
    if (cudaMallocHost((void**)&sitelink_ex[i], V_ex*gaugeSiteSize*gSize) ==  cudaErrorMemoryAllocation) {
      errorQuda("ERROR: cudaMallocHost failed for sitelink_ex\n");
    }
  }


  void* milc_sitelink;
  milc_sitelink = (void*)malloc(4*V*gaugeSiteSize*gSize);
  if(milc_sitelink == NULL){
    errorQuda("ERROR: allocating milc_sitelink failed\n");
  }

  void* milc_sitelink_ex;
  milc_sitelink_ex = (void*)malloc(4*V_ex*gaugeSiteSize*gSize);
  if(milc_sitelink_ex == NULL){
    errorQuda("Error: allocating milc_sitelink failed\n");
  }



  createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1);

  if(gauge_order == QUDA_MILC_GAUGE_ORDER){
    for(int i=0; i<V; ++i){
      for(int dir=0; dir<4; ++dir){
        char* src = (char*)sitelink[dir];
        memcpy((char*)milc_sitelink + (i*4 + dir)*gaugeSiteSize*gSize, src+i*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
      }	
    }
  }

  int X1=Z[0];
  int X2=Z[1];
  int X3=Z[2];
  int X4=Z[3];

  for(int i=0; i < V_ex; i++){
    int sid = i;
    int oddBit=0;
    if(i >= Vh_ex){
      sid = i - Vh_ex;
      oddBit = 1;
    }

    int za = sid/E1h;
    int x1h = sid - za*E1h;
    int zb = za/E2;
    int x2 = za - zb*E2;
    int x4 = zb/E3;
    int x3 = zb - x4*E3;
    int x1odd = (x2 + x3 + x4 + oddBit) & 1;
    int x1 = 2*x1h + x1odd;


    if( x1< 2 || x1 >= X1 +2
        || x2< 2 || x2 >= X2 +2
        || x3< 2 || x3 >= X3 +2
        || x4< 2 || x4 >= X4 +2){
#ifdef MULTI_GPU
      continue;
#endif
    }



    x1 = (x1 - 2 + X1) % X1;
    x2 = (x2 - 2 + X2) % X2;
    x3 = (x3 - 2 + X3) % X3;
    x4 = (x4 - 2 + X4) % X4;

    int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
    if(oddBit){
      idx += Vh;
    }
    for(int dir= 0; dir < 4; dir++){
      char* src = (char*)sitelink[dir];
      char* dst = (char*)sitelink_ex[dir];
      memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);

      // milc ordering 
      memcpy((char*)milc_sitelink_ex + (i*4 + dir)*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
    }//dir
  }//i


  double act_path_coeff[6];
  for(int i=0;i < 6;i++){
    act_path_coeff[i]= 0.1*i;
  }


  //only record the last call's performance
  //the first one is for creating the cpu/cuda data structures
  struct timeval t0, t1;


  void** sitelink_ptr; 
  QudaComputeFatMethod method = (test) ? QUDA_COMPUTE_FAT_EXTENDED_VOLUME : QUDA_COMPUTE_FAT_STANDARD;
  if(gauge_order == QUDA_QDP_GAUGE_ORDER){
    sitelink_ptr = (test) ? (void**)sitelink_ex : (void**)sitelink;
  }else{
    sitelink_ptr = (test) ? (void**)milc_sitelink_ex : (void**)milc_sitelink;
  }
  void* longlink_ptr = longlink;
#ifdef MULTI_GPU
  if(!test) longlink_ptr = NULL; // Have to have an extended volume for the long-link calculation
#endif

  gettimeofday(&t0, NULL);
  computeKSLinkQuda(fatlink, longlink_ptr, NULL, milc_sitelink, act_path_coeff, &qudaGaugeParam, method);
  gettimeofday(&t1, NULL);


  double secs = TDIFF(t0,t1);

  void* fat_reflink[4];
  void* long_reflink[4];
  for(int i=0;i < 4;i++){
    fat_reflink[i] = malloc(V*gaugeSiteSize*gSize);
    if(fat_reflink[i] == NULL){
      errorQuda("ERROR; allocate fat_reflink[%d] failed\n", i);
    }
    long_reflink[i] = malloc(V*gaugeSiteSize*gSize);
    if(long_reflink[i] == NULL) errorQuda("ERROR; allocate long_reflink[%d] failed\n", i);
  }

  if (verify_results){

    //FIXME: we have this compplication because references takes coeff as float/double 
    //        depending on the precision while the GPU code aways take coeff as double
    void* coeff;
    double coeff_dp[6];
    float  coeff_sp[6];
    for(int i=0;i < 6;i++){
      coeff_sp[i] = coeff_dp[i] = act_path_coeff[i];
    }
    if(prec == QUDA_DOUBLE_PRECISION){
      coeff = coeff_dp;
    }else{
      coeff = coeff_sp;
    }
#ifdef MULTI_GPU
    int optflag = 0;
    //we need x,y,z site links in the back and forward T slice
    // so it is 3*2*Vs_t
    int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
    for(int i=0;i < 4; i++){
      ghost_sitelink[i] = malloc(8*Vs[i]*gaugeSiteSize*gSize);
      if (ghost_sitelink[i] == NULL){
        printf("ERROR: malloc failed for ghost_sitelink[%d] \n",i);
        exit(1);
      }
    }

    /*
       nu |     |
       |_____|
       mu
       */

    for(int nu=0;nu < 4;nu++){
      for(int mu=0; mu < 4;mu++){
        if(nu == mu){
          ghost_sitelink_diag[nu*4+mu] = NULL;
        }else{
          //the other directions
          int dir1, dir2;
          for(dir1= 0; dir1 < 4; dir1++){
            if(dir1 !=nu && dir1 != mu){
              break;
            }
          }
          for(dir2=0; dir2 < 4; dir2++){
            if(dir2 != nu && dir2 != mu && dir2 != dir1){
              break;
            }
          }
          ghost_sitelink_diag[nu*4+mu] = malloc(Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
          if(ghost_sitelink_diag[nu*4+mu] == NULL){
            errorQuda("malloc failed for ghost_sitelink_diag\n");
          }

          memset(ghost_sitelink_diag[nu*4+mu], 0, Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
        }

      }
    }

    exchange_cpu_sitelink(qudaGaugeParam.X, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, &qudaGaugeParam, optflag);
    llfat_reference_mg(fat_reflink, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, coeff);
  

    {
      int R[4] = {2,2,2,2};
      exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, sitelink_ex, QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);
      computeLongLinkCPU(long_reflink, sitelink_ex, qudaGaugeParam.cpu_prec, coeff);
    }
#else
    llfat_reference(fat_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff);
    computeLongLinkCPU(long_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff);
#endif

  }//verify_results

  //format change for fatlink and longlink
  void* myfatlink[4];
  void* mylonglink[4];
  for(int i=0;i < 4;i++){
    myfatlink[i] = malloc(V*gaugeSiteSize*gSize);
    if(myfatlink[i] == NULL){
      printf("Error: malloc failed for myfatlink[%d]\n", i);
      exit(1);
    }
    mylonglink[i] = malloc(V*gaugeSiteSize*gSize);
    if(mylonglink[i] == NULL){
      printf("Error: malloc failed for mylonglink[%d]\n", i);
      exit(1);
    }
    memset(myfatlink[i], 0, V*gaugeSiteSize*gSize);
    memset(mylonglink[i], 0, V*gaugeSiteSize*gSize);
  }

  for(int i=0;i < V; i++){
    for(int dir=0; dir< 4; dir++){
      char* src = ((char*)fatlink)+ (4*i+dir)*gaugeSiteSize*gSize;
      char* dst = ((char*)myfatlink[dir]) + i*gaugeSiteSize*gSize;
      memcpy(dst, src, gaugeSiteSize*gSize);

      src = ((char*)longlink)+ (4*i+dir)*gaugeSiteSize*gSize;
      dst = ((char*)mylonglink[dir]) + i*gaugeSiteSize*gSize;
      memcpy(dst, src, gaugeSiteSize*gSize);
    }
  }

  if (verify_results) {
    printfQuda("Checking fat links...\n");
    int res=1;
    for(int dir=0; dir<4; dir++){
      res &= compare_floats(fat_reflink[dir], myfatlink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
    }
    
    strong_check_link(myfatlink, "GPU results: ",
		      fat_reflink, "CPU reference results:",
		      V, qudaGaugeParam.cpu_prec);
    
    printfQuda("Fat-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
#ifdef MULTI_GPU
    if(test){
#endif
      printfQuda("Checking long links...\n");
      res = 1;
      for(int dir=0; dir<4; ++dir){
	res &= compare_floats(long_reflink[dir], mylonglink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
      }
      
      strong_check_link(mylonglink, "GPU results: ",
			long_reflink, "CPU reference results:",
			V, qudaGaugeParam.cpu_prec);
      
      printfQuda("Long-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
      
#ifdef MULTI_GPU
    }else{ // !test
      printfQuda("Extended volume is required for multi-GPU long-link construction\n");
    }
#endif
  }

  int volume = qudaGaugeParam.X[0]*qudaGaugeParam.X[1]*qudaGaugeParam.X[2]*qudaGaugeParam.X[3];
  int flops= 61632;
#ifdef MULTI_GPU
  if(test) flops += (252*4); // long-link contribution 
#else
  flops += (252*4); // 2*117 + 18 (two matrix-matrix multiplications and a matrix rescale)
#endif

  double perf = 1.0* flops*volume/(secs*1024*1024*1024);
  printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", secs*1000, perf);


  for(int i=0;i < 4;i++){
    free(myfatlink[i]);
  }

#ifdef MULTI_GPU
  if (verify_results){
    int i;
    for(i=0;i < 4;i++){
      free(ghost_sitelink[i]);
    }
    for(i=0;i <4; i++){
      for(int j=0;j <4; j++){
        if (i==j){
          continue;
        }
        free(ghost_sitelink_diag[i*4+j]);
      }
    }
  }
#endif

  for(int i=0;i < 4; i++){
    cudaFreeHost(sitelink[i]);
    cudaFreeHost(sitelink_ex[i]);
    free(fat_reflink[i]);
  }
  cudaFreeHost(fatlink);
  cudaFreeHost(longlink);
  if(milc_sitelink) free(milc_sitelink);
  if(milc_sitelink_ex) free(milc_sitelink_ex);
#ifdef MULTI_GPU
  exchange_llfat_cleanup();
#endif
  endQuda();
}