Exemplo n.º 1
0
  void loadLinkToGPU(cudaGaugeField* cudaGauge, cpuGaugeField* cpuGauge, QudaGaugeParam* param)
  {
    if (cudaGauge->Precision() != cpuGauge->Precision()){
      errorQuda("Mismatch between CPU precision and CUDA precision");
    }
    QudaPrecision prec = cudaGauge->Precision();

#ifdef MULTI_GPU
    const int* Z = cudaGauge->X();
#endif
    int pad = cudaGauge->Pad();
    int Vsh_x = param->X[1]*param->X[2]*param->X[3]/2;
    int Vsh_y = param->X[0]*param->X[2]*param->X[3]/2;
    int Vsh_z = param->X[0]*param->X[1]*param->X[3]/2;
    int Vsh_t = param->X[0]*param->X[1]*param->X[2]/2;

    static void* ghost_cpuGauge[4];
    static void* ghost_cpuGauge_diag[16];

#ifdef MULTI_GPU
    static int allocated = 0;
    int Vs[4] = {2*Vsh_x, 2*Vsh_y, 2*Vsh_z, 2*Vsh_t};
  
    if (!allocated) {

      for(int i=0;i < 4; i++) {
	size_t ghost_bytes = 8*Vs[i]*gaugeSiteSize*prec;      
#ifdef GPU_DIRECT 
	ghost_cpuGauge[i] = pinned_malloc(ghost_bytes);
#else
	ghost_cpuGauge[i] = safe_malloc(ghost_bytes);
#endif
      }

      /*
       *  nu |     |
       *     |_____|
       *       mu     
       */
    
      for(int nu=0;nu < 4;nu++){
	for(int mu=0; mu < 4;mu++){
	  if(nu == mu){
	    ghost_cpuGauge_diag[nu*4+mu] = NULL;
	  }else{
	    //the other directions
	    int dir1, dir2;
	    for(dir1= 0; dir1 < 4; dir1++){
	      if(dir1 !=nu && dir1 != mu){
		break;
	      }
	    }
	    for(dir2=0; dir2 < 4; dir2++){
	      if(dir2 != nu && dir2 != mu && dir2 != dir1){
		break;
	      }
	    }
	    //int rc = posix_memalign((void**)&ghost_cpuGauge_diag[nu*4+mu], ALIGNMENT, Z[dir1]*Z[dir2]*gaugeSiteSize*prec);

	    size_t nbytes = Z[dir1]*Z[dir2]*gaugeSiteSize*prec;
#ifdef GPU_DIRECT 
	    ghost_cpuGauge_diag[nu*4+mu] = pinned_malloc(nbytes);
#else
	    ghost_cpuGauge_diag[nu*4+mu] = safe_malloc(nbytes);
#endif
	    memset(ghost_cpuGauge_diag[nu*4+mu], 0, nbytes);
	  }	
	}
      }
      allocated = 1;
    }

    int optflag=1;
    // driver for for packalllink
    exchange_cpu_sitelink(param->X, (void**)cpuGauge->Gauge_p(), ghost_cpuGauge, ghost_cpuGauge_diag, prec, param, optflag);
  
#endif
  
    do_loadLinkToGPU(param->X, cudaGauge->Even_p(), cudaGauge->Odd_p(), (void**)cpuGauge->Gauge_p(), 
		     ghost_cpuGauge, ghost_cpuGauge_diag, 
		     cudaGauge->Reconstruct(), cudaGauge->Bytes(), cudaGauge->VolumeCB(), pad, 
		     Vsh_x, Vsh_y, Vsh_z, Vsh_t, 
		     prec, cpuGauge->Order());
  
#ifdef MULTI_GPU
    if(!(param->preserve_gauge & QUDA_FAT_PRESERVE_COMM_MEM)) {
    
      for(int i=0;i < 4;i++){
	host_free(ghost_cpuGauge[i]);
      }
      for(int i=0;i <4; i++){ 
	for(int j=0;j <4; j++){
	  if (i != j) host_free(ghost_cpuGauge_diag[i*4+j]);
	}
      }    
      allocated = 0;
    }
#endif
  
  }
Exemplo n.º 2
0
  static void
llfat_test(int test)
{

  QudaGaugeParam qudaGaugeParam;
#ifdef MULTI_GPU
  void* ghost_sitelink[4];
  void* ghost_sitelink_diag[16];
#endif


  initQuda(device);

  cpu_prec = prec;
  gSize = cpu_prec;  
  qudaGaugeParam = newQudaGaugeParam();

  qudaGaugeParam.anisotropy = 1.0;

  qudaGaugeParam.X[0] = xdim;
  qudaGaugeParam.X[1] = ydim;
  qudaGaugeParam.X[2] = zdim;
  qudaGaugeParam.X[3] = tdim;

  setDims(qudaGaugeParam.X);

  qudaGaugeParam.cpu_prec = cpu_prec;
  qudaGaugeParam.cuda_prec = prec;
  qudaGaugeParam.gauge_order = gauge_order;
  qudaGaugeParam.type=QUDA_WILSON_LINKS;
  qudaGaugeParam.reconstruct = link_recon;
  /*
     qudaGaugeParam.flag = QUDA_FAT_PRESERVE_CPU_GAUGE
     | QUDA_FAT_PRESERVE_GPU_GAUGE
     | QUDA_FAT_PRESERVE_COMM_MEM;
     */
  qudaGaugeParam.preserve_gauge =0;
  void* fatlink;
  if (cudaMallocHost((void**)&fatlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for fatlink\n");
  }

  void* longlink;
  if (cudaMallocHost((void**)&longlink, 4*V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
    errorQuda("ERROR: cudaMallocHost failed for longlink\n");
  } // page-locked memory

  void* sitelink[4];
  for(int i=0;i < 4;i++){
    if (cudaMallocHost((void**)&sitelink[i], V*gaugeSiteSize*gSize) == cudaErrorMemoryAllocation) {
      errorQuda("ERROR: cudaMallocHost failed for sitelink\n");
    }
  }

  void* sitelink_ex[4];
  for(int i=0;i < 4;i++){
    if (cudaMallocHost((void**)&sitelink_ex[i], V_ex*gaugeSiteSize*gSize) ==  cudaErrorMemoryAllocation) {
      errorQuda("ERROR: cudaMallocHost failed for sitelink_ex\n");
    }
  }


  void* milc_sitelink;
  milc_sitelink = (void*)malloc(4*V*gaugeSiteSize*gSize);
  if(milc_sitelink == NULL){
    errorQuda("ERROR: allocating milc_sitelink failed\n");
  }

  void* milc_sitelink_ex;
  milc_sitelink_ex = (void*)malloc(4*V_ex*gaugeSiteSize*gSize);
  if(milc_sitelink_ex == NULL){
    errorQuda("Error: allocating milc_sitelink failed\n");
  }



  createSiteLinkCPU(sitelink, qudaGaugeParam.cpu_prec, 1);

  if(gauge_order == QUDA_MILC_GAUGE_ORDER){
    for(int i=0; i<V; ++i){
      for(int dir=0; dir<4; ++dir){
        char* src = (char*)sitelink[dir];
        memcpy((char*)milc_sitelink + (i*4 + dir)*gaugeSiteSize*gSize, src+i*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
      }	
    }
  }

  int X1=Z[0];
  int X2=Z[1];
  int X3=Z[2];
  int X4=Z[3];

  for(int i=0; i < V_ex; i++){
    int sid = i;
    int oddBit=0;
    if(i >= Vh_ex){
      sid = i - Vh_ex;
      oddBit = 1;
    }

    int za = sid/E1h;
    int x1h = sid - za*E1h;
    int zb = za/E2;
    int x2 = za - zb*E2;
    int x4 = zb/E3;
    int x3 = zb - x4*E3;
    int x1odd = (x2 + x3 + x4 + oddBit) & 1;
    int x1 = 2*x1h + x1odd;


    if( x1< 2 || x1 >= X1 +2
        || x2< 2 || x2 >= X2 +2
        || x3< 2 || x3 >= X3 +2
        || x4< 2 || x4 >= X4 +2){
#ifdef MULTI_GPU
      continue;
#endif
    }



    x1 = (x1 - 2 + X1) % X1;
    x2 = (x2 - 2 + X2) % X2;
    x3 = (x3 - 2 + X3) % X3;
    x4 = (x4 - 2 + X4) % X4;

    int idx = (x4*X3*X2*X1+x3*X2*X1+x2*X1+x1)>>1;
    if(oddBit){
      idx += Vh;
    }
    for(int dir= 0; dir < 4; dir++){
      char* src = (char*)sitelink[dir];
      char* dst = (char*)sitelink_ex[dir];
      memcpy(dst+i*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);

      // milc ordering 
      memcpy((char*)milc_sitelink_ex + (i*4 + dir)*gaugeSiteSize*gSize, src+idx*gaugeSiteSize*gSize, gaugeSiteSize*gSize);
    }//dir
  }//i


  double act_path_coeff[6];
  for(int i=0;i < 6;i++){
    act_path_coeff[i]= 0.1*i;
  }


  //only record the last call's performance
  //the first one is for creating the cpu/cuda data structures
  struct timeval t0, t1;


  void** sitelink_ptr; 
  QudaComputeFatMethod method = (test) ? QUDA_COMPUTE_FAT_EXTENDED_VOLUME : QUDA_COMPUTE_FAT_STANDARD;
  if(gauge_order == QUDA_QDP_GAUGE_ORDER){
    sitelink_ptr = (test) ? (void**)sitelink_ex : (void**)sitelink;
  }else{
    sitelink_ptr = (test) ? (void**)milc_sitelink_ex : (void**)milc_sitelink;
  }
  void* longlink_ptr = longlink;
#ifdef MULTI_GPU
  if(!test) longlink_ptr = NULL; // Have to have an extended volume for the long-link calculation
#endif

  gettimeofday(&t0, NULL);
  computeKSLinkQuda(fatlink, longlink_ptr, NULL, milc_sitelink, act_path_coeff, &qudaGaugeParam, method);
  gettimeofday(&t1, NULL);


  double secs = TDIFF(t0,t1);

  void* fat_reflink[4];
  void* long_reflink[4];
  for(int i=0;i < 4;i++){
    fat_reflink[i] = malloc(V*gaugeSiteSize*gSize);
    if(fat_reflink[i] == NULL){
      errorQuda("ERROR; allocate fat_reflink[%d] failed\n", i);
    }
    long_reflink[i] = malloc(V*gaugeSiteSize*gSize);
    if(long_reflink[i] == NULL) errorQuda("ERROR; allocate long_reflink[%d] failed\n", i);
  }

  if (verify_results){

    //FIXME: we have this compplication because references takes coeff as float/double 
    //        depending on the precision while the GPU code aways take coeff as double
    void* coeff;
    double coeff_dp[6];
    float  coeff_sp[6];
    for(int i=0;i < 6;i++){
      coeff_sp[i] = coeff_dp[i] = act_path_coeff[i];
    }
    if(prec == QUDA_DOUBLE_PRECISION){
      coeff = coeff_dp;
    }else{
      coeff = coeff_sp;
    }
#ifdef MULTI_GPU
    int optflag = 0;
    //we need x,y,z site links in the back and forward T slice
    // so it is 3*2*Vs_t
    int Vs[4] = {Vs_x, Vs_y, Vs_z, Vs_t};
    for(int i=0;i < 4; i++){
      ghost_sitelink[i] = malloc(8*Vs[i]*gaugeSiteSize*gSize);
      if (ghost_sitelink[i] == NULL){
        printf("ERROR: malloc failed for ghost_sitelink[%d] \n",i);
        exit(1);
      }
    }

    /*
       nu |     |
       |_____|
       mu
       */

    for(int nu=0;nu < 4;nu++){
      for(int mu=0; mu < 4;mu++){
        if(nu == mu){
          ghost_sitelink_diag[nu*4+mu] = NULL;
        }else{
          //the other directions
          int dir1, dir2;
          for(dir1= 0; dir1 < 4; dir1++){
            if(dir1 !=nu && dir1 != mu){
              break;
            }
          }
          for(dir2=0; dir2 < 4; dir2++){
            if(dir2 != nu && dir2 != mu && dir2 != dir1){
              break;
            }
          }
          ghost_sitelink_diag[nu*4+mu] = malloc(Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
          if(ghost_sitelink_diag[nu*4+mu] == NULL){
            errorQuda("malloc failed for ghost_sitelink_diag\n");
          }

          memset(ghost_sitelink_diag[nu*4+mu], 0, Z[dir1]*Z[dir2]*gaugeSiteSize*gSize);
        }

      }
    }

    exchange_cpu_sitelink(qudaGaugeParam.X, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, &qudaGaugeParam, optflag);
    llfat_reference_mg(fat_reflink, sitelink, ghost_sitelink, ghost_sitelink_diag, qudaGaugeParam.cpu_prec, coeff);
  

    {
      int R[4] = {2,2,2,2};
      exchange_cpu_sitelink_ex(qudaGaugeParam.X, R, sitelink_ex, QUDA_QDP_GAUGE_ORDER, qudaGaugeParam.cpu_prec, 0, 4);
      computeLongLinkCPU(long_reflink, sitelink_ex, qudaGaugeParam.cpu_prec, coeff);
    }
#else
    llfat_reference(fat_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff);
    computeLongLinkCPU(long_reflink, sitelink, qudaGaugeParam.cpu_prec, coeff);
#endif

  }//verify_results

  //format change for fatlink and longlink
  void* myfatlink[4];
  void* mylonglink[4];
  for(int i=0;i < 4;i++){
    myfatlink[i] = malloc(V*gaugeSiteSize*gSize);
    if(myfatlink[i] == NULL){
      printf("Error: malloc failed for myfatlink[%d]\n", i);
      exit(1);
    }
    mylonglink[i] = malloc(V*gaugeSiteSize*gSize);
    if(mylonglink[i] == NULL){
      printf("Error: malloc failed for mylonglink[%d]\n", i);
      exit(1);
    }
    memset(myfatlink[i], 0, V*gaugeSiteSize*gSize);
    memset(mylonglink[i], 0, V*gaugeSiteSize*gSize);
  }

  for(int i=0;i < V; i++){
    for(int dir=0; dir< 4; dir++){
      char* src = ((char*)fatlink)+ (4*i+dir)*gaugeSiteSize*gSize;
      char* dst = ((char*)myfatlink[dir]) + i*gaugeSiteSize*gSize;
      memcpy(dst, src, gaugeSiteSize*gSize);

      src = ((char*)longlink)+ (4*i+dir)*gaugeSiteSize*gSize;
      dst = ((char*)mylonglink[dir]) + i*gaugeSiteSize*gSize;
      memcpy(dst, src, gaugeSiteSize*gSize);
    }
  }

  if (verify_results) {
    printfQuda("Checking fat links...\n");
    int res=1;
    for(int dir=0; dir<4; dir++){
      res &= compare_floats(fat_reflink[dir], myfatlink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
    }
    
    strong_check_link(myfatlink, "GPU results: ",
		      fat_reflink, "CPU reference results:",
		      V, qudaGaugeParam.cpu_prec);
    
    printfQuda("Fat-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
#ifdef MULTI_GPU
    if(test){
#endif
      printfQuda("Checking long links...\n");
      res = 1;
      for(int dir=0; dir<4; ++dir){
	res &= compare_floats(long_reflink[dir], mylonglink[dir], V*gaugeSiteSize, 1e-3, qudaGaugeParam.cpu_prec);
      }
      
      strong_check_link(mylonglink, "GPU results: ",
			long_reflink, "CPU reference results:",
			V, qudaGaugeParam.cpu_prec);
      
      printfQuda("Long-link test %s\n\n",(1 == res) ? "PASSED" : "FAILED");
      
#ifdef MULTI_GPU
    }else{ // !test
      printfQuda("Extended volume is required for multi-GPU long-link construction\n");
    }
#endif
  }

  int volume = qudaGaugeParam.X[0]*qudaGaugeParam.X[1]*qudaGaugeParam.X[2]*qudaGaugeParam.X[3];
  int flops= 61632;
#ifdef MULTI_GPU
  if(test) flops += (252*4); // long-link contribution 
#else
  flops += (252*4); // 2*117 + 18 (two matrix-matrix multiplications and a matrix rescale)
#endif

  double perf = 1.0* flops*volume/(secs*1024*1024*1024);
  printfQuda("link computation time =%.2f ms, flops= %.2f Gflops\n", secs*1000, perf);


  for(int i=0;i < 4;i++){
    free(myfatlink[i]);
  }

#ifdef MULTI_GPU
  if (verify_results){
    int i;
    for(i=0;i < 4;i++){
      free(ghost_sitelink[i]);
    }
    for(i=0;i <4; i++){
      for(int j=0;j <4; j++){
        if (i==j){
          continue;
        }
        free(ghost_sitelink_diag[i*4+j]);
      }
    }
  }
#endif

  for(int i=0;i < 4; i++){
    cudaFreeHost(sitelink[i]);
    cudaFreeHost(sitelink_ex[i]);
    free(fat_reflink[i]);
  }
  cudaFreeHost(fatlink);
  cudaFreeHost(longlink);
  if(milc_sitelink) free(milc_sitelink);
  if(milc_sitelink_ex) free(milc_sitelink_ex);
#ifdef MULTI_GPU
  exchange_llfat_cleanup();
#endif
  endQuda();
}